source: main/trunk/greenstone2/perllib/inexport.pm@ 37178

Last change on this file since 37178 was 37178, checked in by davidb, 15 months ago

Fixed mistake in test: the `if` condition should check keepold and *replaceold*, not *removeold*.

  • Property svn:executable set to *
File size: 53.7 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use oaiinfo;
38use plugin;
39use plugout;
40use manifest;
41use inexport;
42use util;
43use scriptutil;
44use FileHandle;
45use gsprintf 'gsprintf';
46use printusage;
47use parse2;
48
49use DocHistoryFileUtils;
50use FileUtils;
51
52use File::Basename;
53
# Enumerated values accepted by the -OIDtype option: the available
# strategies for generating a document's unique object identifier (OID).
# The 'desc' values are resource-bundle keys resolved later by gsprintf.
my $oidtype_list =
    [ { 'name' => "hash",
        'desc' => "{import.OIDtype.hash}" },
      { 'name' => "hash_on_full_filename",
        'desc' => "{import.OIDtype.hash_on_full_filename}" },
      { 'name' => "assigned",
        'desc' => "{import.OIDtype.assigned}" },
      { 'name' => "incremental",
        'desc' => "{import.OIDtype.incremental}" },
      { 'name' => "filename",
        'desc' => "{import.OIDtype.filename}" },
      { 'name' => "dirname",
        'desc' => "{import.OIDtype.dirname}" },
      { 'name' => "full_filename",
        'desc' => "{import.OIDtype.full_filename}" } ];
69
# Directory-related command-line arguments shared by import.pl and
# export.pl.  Declared as a package variable so the calling scripts can
# splice these entries into their own argument lists before parsing.
$inexport::directory_arguments =
    [
      { 'name' => "importdir",
        'desc' => "{import.importdir}",
        'type' => "string",
        'reqd' => "no",
        'deft' => "import",
        'hiddengli' => "yes" },
      { 'name' => "collectdir",
        'desc' => "{import.collectdir}",
        'type' => "string",
        # parsearg left "" as default
        #'deft' => &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect"),
        'deft' => "",
        'reqd' => "no",
        'hiddengli' => "yes" },

      ];
# General command-line arguments shared by import.pl and export.pl
# (OID generation, manifest handling, incremental-build flags, logging
# and output destinations).  Each entry follows the parse2 argument
# schema: 'desc' is a resource-bundle key, 'hiddengli'/'modegli'
# control visibility inside the GLI interface.
$inexport::arguments =
    [
      # don't set the default to hash - want to allow this to come from
      # entry in collect.cfg but want to override it here
      { 'name' => "OIDtype",
        'desc' => "{import.OIDtype}",
        'type' => "enum",
        'list' => $oidtype_list,
        'deft' => "hash_on_full_filename",
        'reqd' => "no",
        'modegli' => "2" },
      { 'name' => "OIDmetadata",
        'desc' => "{import.OIDmetadata}",
        'type' => "string",
        'deft' => "dc.Identifier",
        'reqd' => "no",
        'modegli' => "2" },
      { 'name' => "site",
        'desc' => "{import.site}",
        'type' => "string",
        'deft' => "",
        'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "manifest",
        'desc' => "{import.manifest}",
        'type' => "string",
        'deft' => "",
        'reqd' => "no",
        'hiddengli' => "yes" } ,
      { 'name' => "incremental",
        'desc' => "{import.incremental}",
        'type' => "flag",
        'hiddengli' => "yes" },
      # keepold / replaceold / removeold are mutually reconciled later by
      # scriptutil::check_removeold_keepold_replaceold()
      { 'name' => "keepold",
        'desc' => "{import.keepold}",
        'type' => "flag",
        'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "replaceold",
        'desc' => "{import.replaceold}",
        'type' => "flag",
        'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "removeold",
        'desc' => "{import.removeold}",
        'type' => "flag",
        'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "language",
        'desc' => "{scripts.language}",
        'type' => "string",
        'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "maxdocs",
        'desc' => "{import.maxdocs}",
        'type' => "int",
        'reqd' => "no",
        'deft' => "-1",
        'range' => "-1,",
        'modegli' => "1" },
      { 'name' => "debug",
        'desc' => "{import.debug}",
        'type' => "flag",
        'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "faillog",
        'desc' => "{import.faillog}",
        'type' => "string",
        # parsearg left "" as default
        #'deft' => &FileUtils::filenameConcatenate("<collectdir>", "colname", "etc", "fail.log"),
        'deft' => "",
        'reqd' => "no",
        'modegli' => "3" },
      # 'out' and 'statsfile' accept STDERR/STDOUT or a filename; see new()
      { 'name' => "out",
        'desc' => "{import.out}",
        'type' => "string",
        'deft' => "STDERR",
        'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "statsfile",
        'desc' => "{import.statsfile}",
        'type' => "string",
        'deft' => "STDERR",
        'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "verbosity",
        'desc' => "{import.verbosity}",
        'type' => "int",
        'range' => "0,",
        'deft' => "2",
        'reqd' => "no",
        'modegli' => "3" },
      { 'name' => "gli",
        'desc' => "{scripts.gli}",
        'type' => "flag",
        'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "xml",
        'desc' => "{scripts.xml}",
        'type' => "flag",
        'reqd' => "no",
        'hiddengli' => "yes" },

      ];
192
# Constructor shared by import.pl and export.pl.
#
# Parameters:
#   $mode                - "import" or "export"
#   $argv                - ref to the script's argument list; after parsing,
#                          exactly one element (the collection name) must remain
#   $options             - hashref whose 'args' entry lists the recognised
#                          arguments, fed to parse2::parse
#   $opt_listall_options - option set printed when -listall was given
#
# Dies (after printing usage) on parse errors, -h, -listall, or a wrong
# number of leftover arguments.  Returns the blessed inexport object.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
	&PrintUsage::print_txt_usage($options, "{import.params}",1);
	print STDERR "Something went wrong during parsing the arguments. Scroll up for details.\n";
	die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
	&gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($self->{'listall'}) {
	if ($self->{'xml'}) {
	    &PrintUsage::print_xml_usage($opt_listall_options);
	}
	else
	{
	    &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
	}
	die "\n";
    }

    if ($self->{'xml'}) {
	&PrintUsage::print_xml_usage($options);
	print "\n";
	return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
	&gsprintf::output_strings_in_UTF8;
    }

    # If the user specified -h, then we output the usage
    if (@$argv && $argv->[0] =~ /^\-+h/) {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }
    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name

    if ($intArgLeftinAfterParsing != 1 )
    {
	&PrintUsage::print_txt_usage($options, "{import.params}", 1);
	print STDERR "There should be one argument left after parsing the script args: the collection name.\n";
	die "\n";
    }

    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
	# Three-argument open: stating the mode explicitly means a
	# user-supplied filename beginning with '>' or '|' cannot change
	# what the open does (the old two-argument ">$out" form could)
	open (OUT, ">", $out) ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
	# pass the handle around by its package-qualified symbolic name;
	# the file-level "no strict 'refs'" makes $out->autoflush work on it
	$out = 'inexport::OUT';
	$self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    my $statsfile = $self->{'statsfile'};
    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
	# same three-argument open treatment as for -out above
	open (STATSFILE, ">", $statsfile) ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $statsfile) && die);
	$statsfile = 'inexport::STATSFILE';
	$self->{'close_stats'} = 1;
    }
    $statsfile->autoflush(1);
    $self->{'statsfile'} = $statsfile;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # Unless otherwise stated all manifests are considered version 1---where
    # they act more like an advanced process expression---as compared to newer
    # manifest files that act as an explicit (and exhaustive) list of files to
    # process [jmt12]
    $self->{'manifest_version'} = 1;

    return bless $self, $class;
}
287
# Simplified version of the constructor for use with CGI scripts.
# Skips all command-line parsing: the mode, collection and (optional)
# site come straight from the caller, and output goes to STDERR.
sub newCGI
{
    my $class = shift (@_);
    my ($mode, $collect, $gsdl_cgi, $opt_site) = @_;

    my $self = {
	'xml'  => 0,
	'mode' => $mode,
	'out'  => "STDERR",
    };

    if (defined $gsdl_cgi) {
	# Running under the CGI wrapper: it knows where this site's
	# collect directory lives.
	$self->{'site'}       = $opt_site;
	$self->{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }
    else {
	# Stand-alone fallback: default to $GSDLHOME/collect with no site.
	$self->{'site'}       = "";
	$self->{'collectdir'} = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect");
    }

    $self->{'faillog'}    = "";
    $self->{'collection'} = $collect;

    return bless $self, $class;
}
# Accessor: returns the name of the collection this object operates on.
sub get_collection
{
    my ($self) = @_;

    return $self->{'collection'};
}
319
320
# Locates and loads the collection's configuration (collect.cfg for GS2,
# collectionConfig.xml for GS3), opening the fail log along the way.
#
# Parameters: $collection - collection name
#             $options    - parsed script options (only referenced by the
#                           commented-out usage call on failure)
# Returns:    ($config_filename, $collectcfg) - config file path and the
#             parsed configuration hash.
# Side effects: sets $self->{'gs_version'}, opens FAILLOG, augments @INC
#             with the collection's perllib, stores config_filename.
# Dies if the collection cannot be resolved or the fail log can't be opened.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    # use_collection() also sets $ENV{'GSDLCOLLECTDIR'}; "" means failure
    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
	#&PrintUsage::print_txt_usage($options, "{import.params}", 1);
	die "\n";
    }

    # set gs_version 2/3: a non-empty site name implies Greenstone 3
    $self->{'gs_version'} = "2";
    if ((defined $site) && ($site ne "")) {
	# gs3
	$self->{'gs_version'} = "3";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib'));

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
	$faillog = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # Three-argument open: an explicit mode prevents a filename beginning
    # with '>' or '|' from being misread as part of the mode (the old
    # two-argument ">$faillog" form was vulnerable to this)
    open (FAILLOG, ">", $faillog) ||
	(&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);


    my $faillogname = $faillog;
    # hand the handle around by its package-qualified symbolic name
    # (the file-level "no strict 'refs'" permits the autoflush call)
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;
    $self->{'close_faillog'} = 1;

    # Read in the collection configuration file.
    my $gs_mode = "gs".$self->{'gs_version'}; #gs2 or gs3
    my $config_filename = &colcfg::get_collect_cfg_name($out, $gs_mode);

    # store the config file's name, so oaiinfo object constructor can be instantiated with it
    $self->{'config_filename'} = $config_filename;

    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
373
# Merges settings from the parsed collection configuration ($collectcfg)
# into $self, but only for options the user left at their defaults (the
# 'default_*' keys mark values that were not given on the command line,
# so collect.cfg may override them).  Also resolves the import and
# archive/export directories to absolute paths, derives the companion
# "<archivedir>_keepold" path used for file-level document-version
# history, and reconciles the removeold/keepold/replaceold/incremental
# flags via scriptutil.
#
# Parameters: $collectcfg - hashref of the parsed collection config.
# Side effects: mutates many $self fields and $collectcfg->{'infodbtype'};
#               dies if the import directory does not exist.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $importdir = $self->{'importdir'};
    # 'archivedir' may actually be the export dir when exporting
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
    my $out = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
	$collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
	# we can't use the text version for archives dbs.
	$collectcfg->{'infodbtype'} = "gdbm";
    }

    # collect.cfg may override importdir only if the command line didn't set it
    if (defined $self->{'default_importdir'} && defined $collectcfg->{'importdir'}) {
	$importdir = $collectcfg->{'importdir'};
    }

    if ($inexport_mode eq "import") {
	if ( defined $self->{'default_archivedir'} && defined $collectcfg->{'archivedir'}) {
	    $archivedir = $collectcfg->{'archivedir'};
	}
    }
    elsif ($inexport_mode eq "export") {
	if (defined $self->{'default_exportdir'} && defined $collectcfg->{'exportdir'}) {
	    $archivedir = $collectcfg->{'exportdir'};
	}
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    if (!&FileUtils::isFilenameAbsolute($importdir))
    {
	$importdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $importdir);
    }
    else
    {
	# Don't do this - it kills protocol prefixes
	#$importdir =~ s/[\\\/]+/\//g;
	#$importdir =~ s/\/$//;
	# Do this instead
	# NOTE(review): the return value is discarded here, unlike the
	# archivedir case below which assigns it back — confirm that
	# sanitizePath modifies its argument in place, otherwise this
	# call has no effect
	&FileUtils::sanitizePath($importdir);
    }

    if (!&FileUtils::directoryExists($importdir))
    {
	&gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
	die "\n";
    }
    $self->{'importdir'} = $importdir;

    if (!&FileUtils::isFilenameAbsolute($archivedir)) {
	$archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $archivedir);
    }
    else {

	$archivedir = &FileUtils::sanitizePath($archivedir);
    }

    my $archivedir_keepold = "${archivedir}_keepold"; # used when file-level document-version history is in play
    $self->{'archivedir'} = $archivedir;
    $self->{'archivedir_keepold'} = $archivedir_keepold;

    # remaining overrides follow the same "only if still at default" rule
    if (defined $self->{'default_verbosity'}) {
	if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
	    $self->{'verbosity'} = $collectcfg->{'verbosity'};
	}
    }

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
	$self->{'manifest'} = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
	if ($collectcfg->{'gzip'} =~ /^true$/i) {
	    $self->{'gzip'} = 1;
	}
    }

    if (defined $self->{'default_maxdocs'}) {
	if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
	    $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
	}
    }



    if (defined $self->{'default_OIDtype'} ) {
	# only accept OIDtype values from the known enumeration
	if (defined $collectcfg->{'OIDtype'}
	    && $collectcfg->{'OIDtype'} =~ /^(hash|hash_on_full_filename|incremental|assigned|filename|dirname|full_filename)$/) {
	    $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
	}
    }

    if (defined $self->{'default_OIDmetadata'}) {
	if (defined $collectcfg->{'OIDmetadata'}) {
	    $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
	}
    }

    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
	$self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
	$self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    # scriptutil resolves conflicts between the three mutually-exclusive
    # flags and determines the effective incremental mode
    my ($removeold, $keepold, $replaceold, $incremental, $incremental_mode)
	= &scriptutil::check_removeold_keepold_replaceold($self->{'removeold'}, $self->{'keepold'}, $self->{'replaceold'},
							  $self->{'incremental'}, $checkdir,
							  $collectcfg);

    $self->{'removeold'} = $removeold;
    $self->{'keepold'} = $keepold;
    $self->{'replaceold'} = $replaceold;
    $self->{'incremental'} = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;

    # Since this wasted my morning, let's at least warn a user that manifest
    # files now *only* work if keepold is set [jmt12]
    if ($self->{'manifest'} && (!$keepold || !$incremental))
    {
	print STDERR "Warning: -manifest flag should not be specified without also setting -keepold or -incremental\n";
    }
}
509
510sub process_files
511{
512 my $self = shift @_;
513 my ($config_filename,$collectcfg) = @_;
514
515 my $inexport_mode = $self->{'mode'};
516
517 my $verbosity = $self->{'verbosity'};
518 my $debug = $self->{'debug'};
519
520 my $importdir = $self->{'importdir'};
521 my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
522 # 'archivedir' is a tad abused, and is sometimes set to the 'exportdir' value,
523 # meaining 'archivedir_keepold' is actually the export dir name with '_keepold' appended
524 my $archivedir_keepold = $self->{'archivedir_keepold'};
525
526 my $incremental = $self->{'incremental'};
527 my $incremental_mode = $self->{'incremental_mode'};
528
529 my $gs_version = $self->{'gs_version'};
530
531 my $removeold = $self->{'removeold'};
532 my $replaceold = $self->{'replaceold'};
533 my $keepold = $self->{'keepold'};
534
535 my $saveas = $self->{'saveas'};
536 my $saveas_options = $self->{'saveas_options'};
537 my $OIDtype = $self->{'OIDtype'};
538 my $OIDmetadata = $self->{'OIDmetadata'};
539
540 my $out = $self->{'out'};
541 my $faillog = $self->{'faillog'};
542
543 my $maxdocs = $self->{'maxdocs'};
544 my $gzip = $self->{'gzip'};
545 my $groupsize = $self->{'groupsize'};
546 my $sortmeta = $self->{'sortmeta'};
547
548 my $removeprefix = $self->{'removeprefix'};
549 my $removesuffix = $self->{'removesuffix'};
550
551 my $gli = $self->{'gli'};
552
553 # related to export
554 my $xsltfile = $self->{'xsltfile'};
555 my $group_marc = $self->{'group_marc'};
556 my $mapping_file = $self->{'mapping_file'};
557 my $xslt_mets = $self->{'xslt_mets'};
558 my $xslt_txt = $self->{'xslt_txt'};
559 my $fedora_namespace = $self->{'fedora_namespace'};
560 my $metadata_prefix = $self->{'metadata_prefix'};
561
562 if ($inexport_mode eq "import") {
563 print STDERR "<Import>\n" if $gli;
564 }
565 else {
566 print STDERR "<export>\n" if $gli;
567 }
568
569 my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
570 if ($self->{'manifest'} ne "") {
571 my $manifest_filename = $self->{'manifest'};
572
573 if (!&FileUtils::isFilenameAbsolute($manifest_filename)) {
574 $manifest_filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
575 }
576 $self->{'manifest'} = &FileUtils::sanitizePath($self->{'manifest'});
577 #$self->{'manifest'} =~ s/[\\\/]+/\//g;
578 #$self->{'manifest'} =~ s/\/$//;
579
580 $manifest_lookup->parse($manifest_filename);
581
582 # manifests may now include a version number [jmt12]
583 $self->{'manifest_version'} = $manifest_lookup->get_version();
584 }
585
586 my $manifest = $self->{'manifest'};
587
588 # load all the plugins
589 my $plugins = [];
590 if (defined $collectcfg->{'plugin'}) {
591 $plugins = $collectcfg->{'plugin'};
592 }
593
594 my $plugin_incr_mode = $incremental_mode;
595 if ($manifest ne "") {
596 # if we have a manifest file, then we pretend we are fully incremental for plugins
597 $plugin_incr_mode = "all";
598 }
599 #some global options for the plugins
600 my @global_opts = ();
601
602 my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode, $gs_version, $self->{'site'});
603 if (scalar(@$pluginfo) == 0) {
604 &gsprintf($out, "{import.no_plugins_loaded}\n");
605 die "\n";
606 }
607
608 # Whether -removeold, -keepold or -replaceold there should never be an existing archivedir_keepold
609 # => Taken to be a sign of a previous import/export that has gone wrong
610 # => Print out error message and stop!
611
612 if (&FileUtils::directoryExists($archivedir_keepold)) {
613 my $rkr_old_minus_option = undef; # rkr = remove, keep, replace (whichever one is being used)
614 if ($removeold) {
615 $rkr_old_minus_option = "-removeold";
616 }
617 elsif ($keepold) {
618 $rkr_old_minus_option = "-keepold";
619 }
620 elsif ($replaceold) {
621 $rkr_old_minus_option = "-replaceold";
622 }
623
624 &gsprintf(STDERR, "\n");
625 &gsprintf(STDERR, "Detected existing directory:\n\n");
626 &gsprintf(STDERR, " $archivedir_keepold\n\n");
627 &gsprintf(STDERR, "Stopping $inexport_mode.\n\n");
628
629 &gsprintf(STDERR, "**** When building with $rkr_old_minus_option, there cannot be a pre-existing 'archives_keepold' directory\n");
630 &gsprintf(STDERR, "****\n");
631 &gsprintf(STDERR, "**** Review your collection directory folder, and determine whether to:\n");
632 &gsprintf(STDERR, "**** (a) move your 'archives_keepold' back to being 'archives'; or\n");
633 &gsprintf(STDERR, "**** (b) remove your 'archives_keepold'\n");
634 &gsprintf(STDERR, "**** before running your $inexport_mode command again\n\n");
635
636 exit 1; # c errno for 'operation not permitted'
637 }
638
639
640 # remove the old contents of the archives directory (and tmp directory) if needed
641
642 if ($removeold) {
643 if (&FileUtils::directoryExists($archivedir)) {
644 &gsprintf($out, "{import.removing_archives}\n");
645 &FileUtils::removeFilesRecursive($archivedir);
646 }
647 my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
648 $tmpdir =~ s/[\\\/]+/\//g;
649 $tmpdir =~ s/\/$//;
650 if (&FileUtils::directoryExists($tmpdir)) {
651 &gsprintf($out, "{import.removing_tmpdir}\n");
652 &FileUtils::removeFilesRecursive($tmpdir);
653 }
654 }
655 else {
656 # If not $removeold, then must be $keepold or $replaceold
657 # => for either case want to "hard-link"/copy 'archives' to 'archives_keepold'
658
659 # Want to be super careful about doing this, so as not to accidentally
660 # wipe out any previous file-level document-version history
661
662 # If got to here, then there is no pre-existing $archivedir_keepold
663 # => Hard-link copy the contents of 'archives' to 'archives_keepold'
664 # => Stop if there is any issue with creating the hard-link copy
665
666 if (!&FileUtils::hardlinkFilesRefRecursive([$archivedir],$archivedir_keepold, { 'strict' => 1 } )) {
667
668 &gsprintf(STDERR, "\nError message: $!\n\n");
669
670 &gsprintf(STDERR, "**** Failed to make a hard-link copy of:\n");
671 &gsprintf(STDERR, "**** $archivedir\n");
672 &gsprintf(STDERR, "**** to:\n");
673 &gsprintf(STDERR, "**** $archivedir_keepold\n");
674 &gsprintf(STDERR, "****\n");
675 &gsprintf(STDERR, "**** Unable to proceed with file-level document-version history $inexport_mode => Stopping\n");
676
677 exit $!;
678 }
679 }
680
681 # create the archives dir if needed
682 &FileUtils::makeAllDirectories($archivedir);
683
684 # read the archive information file
685
686 # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
687 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-doc"));
688 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-src"));
689
690 # When we make these initial calls to determine the archive information doc
691 # and src databases we pass through a '1' to indicate this is the first
692 # time we are referring to these databases. When using dynamic dbutils
693 # (available in extensions) this indicates to some database types (for
694 # example, persistent servers) that this is a good time to perform any
695 # one time initialization. The argument has no effect on vanilla dbutils
696 # [jmt12]
697 my $perform_firsttime_init = 1;
698 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, $perform_firsttime_init);
699 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, $perform_firsttime_init);
700
701
702 my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
703 $archive_info->load_info($arcinfo_doc_filename);
704 # Load in reverse-lookup info (used to determine the docs that a file in import are used in),
705 # so we don't overwrite existing info when we do incremental import
706 # From here on, make all changes to this object, then write out the file at the end.
707 $archive_info->load_rev_info($arcinfo_src_filename);
708
709 if ($manifest eq "") {
710 # Load in list of files in import folder from last import (if present)
711 $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
712 }
713
714 ####Use Plugout####
715 my $plugout;
716
717 my $generate_auxiliary_files = 0;
718 if ($inexport_mode eq "import") {
719 $generate_auxiliary_files = 1;
720 }
721 elsif ($self->{'include_auxiliary_database_files'}) {
722 $generate_auxiliary_files = 1;
723 }
724 $self->{'generate_auxiliary_files'} = $generate_auxiliary_files;
725
726 # Option to use user defined plugout
727 if ($inexport_mode eq "import") {
728 if (defined $collectcfg->{'plugout'}) {
729 # If a plugout was specified in the collect.cfg file, assume it is sensible
730 # We can't check the name because it could be anything, if it is a custom plugout
731 print STDERR "Using plugout specified in collect.cfg: ".join(' ', @{$collectcfg->{'plugout'}})."\n";
732 $plugout = $collectcfg->{'plugout'};
733 }
734 else {
735 push @$plugout,$saveas."Plugout";
736 }
737
738 }
739 else {
740 if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(GreenstoneXML|.*METS|DSpace|MARCXML)Plugout/) {
741 $plugout = $collectcfg->{'plugout'};
742 print STDERR "Using plugout specified in collect.cfg: $collectcfg->{'plugout'}\n";
743 }
744 else {
745 push @$plugout,$saveas."Plugout";
746 }
747 }
748
749 my $plugout_name = $plugout->[0];
750
751 if (defined $saveas_options) {
752 my @user_plugout_options = split(" ", $saveas_options);
753 push @$plugout, @user_plugout_options;
754 }
755 push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
756 push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
757 push @$plugout,("-debug") if ($debug);
758 push @$plugout,("-gzip_output") if ($gzip);
759 push @$plugout,("-output_handle",$out) if (defined $out);
760 push @$plugout,("-site",$self->{'site'}) if (defined $self->{'site'});
761
762 push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");
763 push @$plugout, ("-no_auxiliary_databases") if ($generate_auxiliary_files == 0);
764 if ($inexport_mode eq "import") {
765 if ($plugout_name =~ m/^GreenstoneXMLPlugout$/) {
766 push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
767 }
768 }
769 my $processor = &plugout::load_plugout($plugout);
770 $processor->setoutputdir ($archivedir);
771 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
772 $processor->set_OIDtype ($OIDtype, $OIDmetadata);
773 $processor->begin();
774 &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
775
776 if ($removeold) {
777 # occasionally, plugins may want to do something on remove
778 # old, eg pharos image indexing
779 &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
780 }
781
782 # process the import directory
783 my $block_hash = {};
784 $block_hash->{'new_files'} = {};
785 $block_hash->{'reindex_files'} = {};
786
787 # all of these are set somewhere else, so it's more readable to define them here [jmt12]
788 $block_hash->{'all_files'} = {};
789 $block_hash->{'deleted_files'} = {};
790 $block_hash->{'file_blocks'} = {};
791 $block_hash->{'metadata_files'} = {};
792 $block_hash->{'shared_fileroot'} = '';
793 $block_hash->{'manifest'} = 'false';
794 my $metadata = {};
795
796 # global blocking pass may set up some metadata
797 # does this set up metadata?????
798 # - when we have a newer manifest file we don't do this -unless- the
799 # collection configuration indicates this collection contains complex
800 # (inherited) metadata [jmt12]
801 if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true'))
802 {
803 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
804 }
805 else
806 {
807 print STDERR "Skipping global file scan due to manifest and complexmeta configuration\n";
808 }
809
810
811 # Prepare to work with the <collection>/etc/oai-inf.<db> that keeps track
812 # of the OAI identifiers with their time stamps and deleted status.
813 my $oai_info = new oaiinfo($self->{'config_filename'}, $collectcfg->{'infodbtype'}, $verbosity);
814 my $have_manifest = ($manifest eq '') ? 0 : 1;
815 $oai_info->import_stage($removeold, $have_manifest);
816
817
818 if ($manifest ne "") {
819
820 # mark that we are using a manifest - information that might be needed
821 # down in plugins (for instance DirectoryPlugin)
822 $block_hash->{'manifest'} = $self->{'manifest_version'};
823
824 #
825 # 1. Process delete files first
826 #
827 my @deleted_files = keys %{$manifest_lookup->{'delete'}};
828 my @full_deleted_files = ();
829
830 # ensure all filenames are absolute
831 foreach my $df (@deleted_files) {
832 my $full_df =
833 (&FileUtils::isFilenameAbsolute($df))
834 ? $df
835 : &FileUtils::filenameConcatenate($importdir,$df);
836
837 # gdb doesn't store short filenames, so ensure we specify full filenames for deletion
838 $full_df = &util::upgrade_if_dos_filename($full_df); # will only do something on windows
839
840 if (-d $full_df) {
841 &add_dir_contents_to_list($full_df, \@full_deleted_files);
842 } else {
843 push(@full_deleted_files,$full_df);
844 }
845 }
846
847 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
848 mark_docs_for_deletion($archive_info,{},
849 \@full_deleted_files,
850 $archivedir, $verbosity, "delete");
851
852
853 #
854 # 2. Now files for reindexing
855 #
856
857 my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
858 my @full_reindex_files = ();
859 # ensure all filenames are absolute
860 foreach my $rf (@reindex_files) {
861 my $full_rf =
862 (&FileUtils::isFilenameAbsolute($rf))
863 ? $rf
864 : &FileUtils::filenameConcatenate($importdir,$rf);
865
866 if (-d $full_rf) {
867 &add_dir_contents_to_list($full_rf, \@full_reindex_files);
868 } else {
869 push(@full_reindex_files,$full_rf);
870 }
871 }
872
873 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
874 mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
875
876 # And now to ensure the new version of the file processed by
877 # appropriate plugin, we need to add it to block_hash reindex list
878 foreach my $full_rf (@full_reindex_files) {
879 $block_hash->{'reindex_files'}->{$full_rf} = 1;
880 }
881
882
883 #
884 # 3. Now finally any new files - add to block_hash new_files list
885 #
886
887 my @new_files = keys %{$manifest_lookup->{'index'}};
888 my @full_new_files = ();
889
890 foreach my $nf (@new_files) {
891 # ensure filename is absolute
892 my $full_nf =
893 (&FileUtils::isFilenameAbsolute($nf))
894 ? $nf
895 : &FileUtils::filenameConcatenate($importdir,$nf);
896
897 if (-d $full_nf) {
898 &add_dir_contents_to_list($full_nf, \@full_new_files);
899 } else {
900 push(@full_new_files,$full_nf);
901 }
902 }
903
904 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
905
906 # need to check this file exists before trying to read it - in the past
907 # it wasn't possible to have a manifest unless keepold was also set so
908 # you were pretty much guaranteed arcinfo existed
909 # [jmt12]
910 # @todo &FileUtils::fileExists($arcinfo_src_filename) [jmt12]
911 if (-e $arcinfo_src_filename)
912 {
913 my $arcinfodb_map = {};
914 &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
915 foreach my $f (@full_new_files) {
916 my $rel_f = &util::abspath_to_placeholders($f);
917
918 # check that we haven't seen it already
919 if (defined $arcinfodb_map->{$rel_f}) {
920 # TODO make better warning
921 print STDERR "Warning: $f ($rel_f) already in src archive, \n";
922 } else {
923 $block_hash->{'new_files'}->{$f} = 1;
924 }
925 }
926
927 undef $arcinfodb_map;
928 }
929 # no existing files - so we can just add all the files [jmt12]
930 else
931 {
932 foreach my $f (@full_new_files)
933 {
934 $block_hash->{'new_files'}->{$f} = 1;
935 }
936 }
937
938 # If we are not using complex inherited metadata (and thus have skipped
939 # the global file scan) we need to at least check for a matching
940 # metadata.xml for the files being indexed/reindexed
941 # - unless we are using the newer version of Manifests, which are treated
942 # verbatim, and should have a metadata element for metadata files (so
943 # we can explicitly process metadata files other than metadata.xml)
944 # [jmt12]
945 if ($self->{'manifest_version'} == 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true'))
946 {
947 my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
948 foreach my $file_to_import (@all_files_to_import)
949 {
950 my $metadata_xml_path = $file_to_import;
951 $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/;
952 if (&FileUtils::fileExists($metadata_xml_path))
953 {
954 &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli);
955 }
956 }
957 }
958
959 # new version manifest files explicitly list metadata files to be
960 # processed (ignoring complexmeta if set)
961 # [jmt12]
962 if ($self->{'manifest_version'} > 1)
963 {
964 # Process metadata files
965 foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}})
966 {
967 $self->perform_process_files($manifest, $pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs);
968 }
969 }
970 } # end if (manifest ne "")
971 else {
972 # if incremental, we read through the import folder to see whats changed.
973
974 if ($incremental || $incremental_mode eq "onlyadd") {
975 prime_doc_oid_count($archivedir);
976
977 # Can now work out which files were new, already existed, and have
978 # been deleted
979
980 new_vs_old_import_diff($archive_info,$block_hash,$importdir,
981 $archivedir,$verbosity,$incremental_mode);
982
983 my @new_files = sort keys %{$block_hash->{'new_files'}};
984 if (scalar(@new_files>0)) {
985 print STDERR "New files and modified metadata files since last import:\n ";
986 print STDERR join("\n ",@new_files), "\n";
987 }
988
989 if ($incremental) {
990 # only look for deletions if we are truely incremental
991 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
992 # Filter out any in gsdl/tmp area
993 my @filtered_deleted_files = ();
994 my $gsdl_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
995 my $collect_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
996 $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
997 $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
998
999 foreach my $df (@deleted_files) {
1000 next if ($df =~ m/^$gsdl_tmp_area/);
1001 next if ($df =~ m/^$collect_tmp_area/);
1002
1003 push(@filtered_deleted_files,$df);
1004 }
1005
1006
1007 @deleted_files = @filtered_deleted_files;
1008
1009 if (scalar(@deleted_files)>0) {
1010 print STDERR "Files deleted since last import:\n ";
1011 print STDERR join("\n ",@deleted_files), "\n";
1012
1013
1014 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
1015
1016 mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
1017 }
1018
1019 my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
1020
1021 if (scalar(@reindex_files)>0) {
1022 print STDERR "Files to reindex since last import:\n ";
1023 print STDERR join("\n ",@reindex_files), "\n";
1024 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
1025 mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
1026 }
1027
1028 }
1029 } # end if incremental/only_add mode
1030 # else no manifest AND not incremental
1031 } # end if else block of manifest ne "" else eq ""
1032
1033 # Check for existence of the file that's to contain earliestDateStamp in archivesdir
1034 # Do nothing if the file already exists (file exists on incremental build).
1035 # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
1036 # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
1037 # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
1038 # oailastmodified and oailastmodifieddate
1039 my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp");
1040 if ($self->{'generate_auxiliary_files'}) {
1041 if (!-f $earliestDatestampFile && -d $archivedir) {
1042 my $current_time_in_seconds = time; # in seconds
1043
1044 if(open(FOUT, ">$earliestDatestampFile")) {
1045 # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
1046 print FOUT $current_time_in_seconds;
1047 close(FOUT);
1048 }
1049 else {
1050 &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
1051 }
1052
1053 }
1054 }
1055
1056 $self->perform_process_files($manifest, $pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs);
1057
1058 if ($saveas eq "FedoraMETS") {
1059 # create collection "doc obj" for Fedora that contains
1060 # collection-level metadata
1061
1062 my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
1063 $doc_obj->set_OID("collection");
1064
1065 my $col_name = undef;
1066 my $col_meta = $collectcfg->{'collectionmeta'};
1067
1068 if (defined $col_meta) {
1069 store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
1070 store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
1071 }
1072 $processor->process($doc_obj);
1073 }
1074
1075 &plugin::end($pluginfo, $processor);
1076
1077 &plugin::deinit($pluginfo, $processor);
1078
1079 # Store the value of OIDCount (used in doc.pm) so it can be
1080 # restored correctly to this value on an incremental build
1081 # - this OIDcount file should only be generated for numerical oids [jmt12]
1082 if ($self->{'OIDtype'} eq 'incremental')
1083 {
1084 store_doc_oid_count($archivedir);
1085 }
1086
1087 # signal to the processor (plugout) that we have finished processing - if we are group processing, then the final output file needs closing.
1088 $processor->close_group_output() if $processor->is_group();
1089 $processor->end();
1090
1091 if ($self->{'generate_auxiliary_files'}) {
1092
1093 # write out the archive information file
1094 # for backwards compatability with archvies.inf file
1095 if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
1096 # In the days of this being a text file, this all we had to do
1097 # Note, if still using this form of archive-inf, then neither
1098 # incremental building nor files-level document-version history
1099 # is suported
1100 $archive_info->save_info($arcinfo_doc_filename);
1101 }
1102 else {
1103 $archive_info->save_revinfo_db($arcinfo_src_filename);
1104 }
1105 }
1106
1107
1108 #
1109 # Now deal with any file-level document-version history (fldv-history)
1110 #
1111
1112 if ($keepold || $replaceold) {
1113
1114 &DocHistoryFileUtils::archivedir_keepold_to_archivedir($collectcfg, $keepold, $replaceold, $incremental_mode, $archive_info, $archivedir,$archivedir_keepold);
1115
1116 }
1117
1118
1119 return $pluginfo;
1120}
1121
1122# @function perform_process_files()
1123# while process_files() above prepares the system to import files this is the
1124# function that actually initiates the plugin pipeline to process the files.
1125# This function should therefore be overridden in subclasses of inexport.pm should
1126# they wish to do different or further processing
1127# @author jmt12
sub perform_process_files
{
    my $self = shift(@_);
    my ($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs) = @_;
    my $gli = $self->{'gli'};

    # Case 1: a specific file named by a version 2+ manifest - read just
    # that one file through the plugin pipeline.
    if ($file_to_import ne '')
    {
	&plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
	return;
    }

    # Case 2: global file scan.  Applies when there is no manifest at all
    # (ordinary import, obeying process_exp/block_exp via $block_hash) or a
    # version 1 manifest (which steers the scan through extra $block_hash
    # settings).  [jmt12]
    if ($manifest eq '' || $self->{'manifest_version'} == 1)
    {
	&plugin::read ($pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
	return;
    }

    # Case 3: a newer manifest with no explicit file - everything it listed
    # has already been read individually, so there is nothing left to do.
    print STDERR "Skipping perform_process_files() due to manifest presence and version\n";
}
1159# perform_process_files()
1160
1161# @function generate_statistics()
sub generate_statistics
{
    my $self = shift(@_);
    my ($pluginfo) = @_;

    # Pull the fields we need from the object state in one hash slice.
    my ($inexport_mode, $out, $faillogname, $statsfile, $gli) =
	@{$self}{'mode', 'out', 'faillogname', 'statsfile', 'gli'};

    # Print a completion banner (the resource key depends on whether we
    # were importing or exporting), then delegate the per-plugin stats.
    my $banner = "*********************************************\n";
    &gsprintf($out, "\n");
    &gsprintf($out, $banner);
    &gsprintf($out, "{$inexport_mode.complete}\n");
    &gsprintf($out, $banner);

    &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
}
1180# generate_statistics()
1181
1182
1183# @function deinit()
1184# Close down any file handles that we opened (and hence are responsible for
1185# closing
sub deinit
{
    # Close each output handle that this object itself opened; the
    # 'close_*' flags were set wherever the corresponding handle was
    # opened, so handles owned by the caller are left untouched.
    my $self = shift(@_);
    if ($self->{'close_out'})       { close OUT; }
    if ($self->{'close_faillog'})   { close FAILLOG; }
    if ($self->{'close_statsfile'}) { close STATSFILE; }
}
1193# deinit()
1194
1195
sub store_collectionmeta
{
    # Copy one collection-level metadata field (e.g. "collectionname" or
    # "collectionextra") from the collect.cfg metadata structure onto the
    # top section of $doc_obj as "ex.<field>" metadata.  Language-qualified
    # values (keys of the form "[l=xx]") become "ex.<field>^xx".
    my ($collectionmeta, $field, $doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();

    # Maps a language key (or "default"-style key) to the field's value.
    my $value_by_lang = $collectionmeta->{$field};

    foreach my $lang_key (keys %$value_by_lang)
    {
	my $value = $value_by_lang->{$lang_key};

	my $md_label = "ex.$field";

	# A key like "[l=en]" carries a language qualifier; append it to
	# the metadata label as "^en".
	if ($lang_key =~ m/^\[l=(.*?)\]$/)
	{
	    $md_label .= "^$1";
	}

	$doc_obj->add_utf8_metadata($top_section, $md_label, $value);

	# see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
	# while "collectionname" in GS2 is called "name" in GS3.
	# Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3.
	# The (English or unqualified) collection name doubles as dc.Title.
	if ($md_label eq "ex.collectionname^en" || $md_label eq "ex.collectionname")
	{
	    $doc_obj->add_utf8_metadata($top_section, "dc.Title", $value);
	}
    }
}
1233
1234
sub oid_count_file {
    # Absolute path of the "OIDcount" state file kept in the archives
    # directory (read/written by prime_doc_oid_count / store_doc_oid_count).
    my ($archivedir) = @_;
    my $oid_count_filename = &FileUtils::filenameConcatenate($archivedir, "OIDcount");
    return $oid_count_filename;
}
1239
1240
sub prime_doc_oid_count
{
    # Restore doc.pm's document-OID counter from the "OIDcount" file in
    # the archives directory (written by store_doc_oid_count() at the end
    # of a previous import), so an incremental build continues numbering
    # where the last one left off.  Silently does nothing if the file is
    # absent (e.g. a full/first build).
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    if (-e $oid_count_filename) {
	# 3-arg open with a lexical filehandle (mode can't be injected via
	# the filename, and the handle can't clash with other barewords)
	if (open(my $oid_in, '<', $oid_count_filename)) {
	    my $OIDcount = <$oid_in>;
	    close($oid_in);

	    # Guard against an empty/truncated file: don't clobber the
	    # counter with undef
	    if (defined $OIDcount) {
		chomp $OIDcount;
		$doc::OIDcount = $OIDcount;
	    }
	}
	else {
	    &gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
	}
    }

}
1260
sub store_doc_oid_count
{
    # Use the file "OIDcount" in the archives directory to record
    # what value doc.pm got up to, so prime_doc_oid_count() can restore
    # it on the next incremental build.  Reports (but does not die) on
    # failure to open or flush the file.

    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # 3-arg open with a lexical filehandle [was 2-arg bareword OIDOUT]
    if (open(my $oid_out, '>', $oid_count_filename)) {
	print {$oid_out} $doc::OIDcount, "\n";

	# check close() on the write handle: buffered write errors (e.g.
	# disk full) only surface here
	if (!close($oid_out)) {
	    &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
	}
    }
    else {
	&gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
1279
1280
1281
sub new_vs_old_import_diff
{
    # Compare the files found in the current import area
    # ($block_hash->{'all_files'}, filled in by an earlier plugin scan)
    # with the file list recorded by the previous import
    # ($archive_info->{'prev_import_filelist'}), and classify each file by
    # moving it into one of the $block_hash sets:
    #   'new_files'                      - not present in the previous import
    #   'existing_files'                 - present before and unchanged
    #   'reindex_files'                  - present before but must be re-imported
    #   'deleted_files'                  - in the previous import, now gone
    #   'new_or_modified_metadata_files' - metadata.xml-style files that are
    #                                      new or newer than the archive info db
    # No return value; all results are communicated through $block_hash.
    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    # -M gives file age in days since this script started, so a SMALLER
    # value means a NEWER file; this is the reference point for "modified
    # since the last build"
    my $archiveinf_timestamp = -M $arcinfo_doc_filename;

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default

    my $prev_all_files = $archive_info->{'prev_import_filelist'};

    # maps absolute (long) filename -> filename as recorded in arcinfo
    my $full_prev_all_files = {};

    foreach my $prev_file (keys %$prev_all_files) {
	# arcinfo deals in real filenames ie windows short names. but the block hash stuff is all full long versions.
	$prev_file = &util::upgrade_if_dos_filename($prev_file);

	if (!&FileUtils::isFilenameAbsolute($prev_file)) {
	    my $full_prev_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$prev_file);
	    $full_prev_all_files->{$full_prev_file} = $prev_file;
	}
	else {
	    $full_prev_all_files->{$prev_file} = $prev_file;
	}
    }


    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

	my $full_curr_file = $curr_file;

	# entry in 'all_files' is moved to either 'existing_files',
	# 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

	if (!&FileUtils::isFilenameAbsolute($curr_file)) {
	    # add in import dir to make absolute
	    $full_curr_file = &FileUtils::filenameConcatenate($importdir,$curr_file);
	}

	###print STDERR "*** new vs old: look to see if full_curr_file=$full_curr_file in full_prev_all_files hashmap\n";

	# figure out if new file or not
	if (defined $full_prev_all_files->{$full_curr_file}) {
	    # delete it so that only files that need deleting are left
	    delete $full_prev_all_files->{$full_curr_file};
	    # had it before. is it a metadata file?
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# is it modified?? (smaller -M age => newer than archive db)
		if (-M $full_curr_file < $archiveinf_timestamp) {
		    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
		    # its newer than last build
		    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
		}
	    }
	    else {
		if ($incremental_mode eq "all") {

		    # had it before
		    $block_hash->{'existing_files'}->{$full_curr_file} = 1;

		}
		else {
		    # Warning in "onlyadd" mode, but had it before!
		    print STDERR "Warning: File $full_curr_file previously imported.\n";
		    print STDERR " Treating as new file\n";

		    $block_hash->{'new_files'}->{$full_curr_file} = 1;

		}
	    }
	}
	else {
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# the new file is the special sort of file greenstone uses
		# to attach metadata to src documents
		# i.e metadata.xml
		# (but note, the filename used is not constrained in
		# Greenstone to always be this)

		print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
		$block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
	    }
	    else {
		$block_hash->{'new_files'}->{$full_curr_file} = 1;
	    }
	}


	# every file has now been classified, so remove it from 'all_files'
	delete $block_hash->{'all_files'}->{$curr_file};
    }




    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
	# split off the directory the metadata file sits in; its whole
	# subtree is forced to reindex
	my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

	$situated_dir =~ s/[\\\/]+$//; # remove trailing slashes
	$situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression

	# Go through existing_files, and mark anything that is contained
	# within 'situated_dir' to be reindexed (in case some of the metadata
	# attaches to one of these files)

	my $reindex_files = [];

	foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

	    if ($existing_f =~ m/^$situated_dir/) {

		# print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

		push(@$reindex_files,$existing_f);
		$block_hash->{'reindex_files'}->{$existing_f} = 1;
		delete $block_hash->{'existing_files'}->{$existing_f};

	    }
	}

	# metadata file needs to be in new_files list so parsed by MetadataXMLPlug
	# (or equivalent)
	$block_hash->{'new_files'}->{$new_mdf} = 1;

    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
	if (-M $existing_filename < $archiveinf_timestamp) {
	    # file is newer than last build

	    my $existing_file = $existing_filename;
	    #my $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'});

	    #my $collectdir_resafe = &util::filename_to_regex($collectdir);
	    #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;

	    # print STDERR "**** Reindexing existing file: $existing_file\n";

	    push(@$reindex_files,$existing_file);
	    $block_hash->{'reindex_files'}->{$existing_filename} = 1;
	}

    }


    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # suppress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    # NOTE: map is used here purely for its side effects on $block_hash
    my @deleted_files = values %$full_prev_all_files;
    map { my $curr_file = $_;
	  my $full_curr_file = $curr_file;

	  if (!&FileUtils::isFilenameAbsolute($curr_file)) {
	      # add in import dir to make absolute

	      $full_curr_file = &FileUtils::filenameConcatenate($collectdir,$curr_file);
	  }


	  if (!-e $full_curr_file) {
	      $curr_file = &util::upgrade_if_dos_filename($curr_file);
	      $block_hash->{'deleted_files'}->{$curr_file} = 1;
	  }
    } @deleted_files;



}
1483
1484
1485# this is used to delete "deleted" docs and to remove old versions of "changed" docs
1486# $mode is 'delete' or 'reindex'
sub mark_docs_for_deletion
{
    # For each file in @$deleted_files, find every OID the file contributed
    # to (via the archiveinf-src reverse index) and mark that OID with
    # index-status "D" in both the arcinfo object and the archiveinf-doc
    # database, so buildcol.pl will delete or reimport it.
    # If the file is merely an associated/metadata file of a document, the
    # document's primary source file is queued in
    # $block_hash->{'reindex_files'} instead.
    # $mode is 'delete' or 'reindex' (only affects messages and the final
    # clean-up pass).  No return value.
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
	$mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
	# use 'archiveinf-src' info database file to look up all the OIDs
	# that this file is used in (note in most cases, it's just one OID)

	# reverse index is keyed by dos-style (short) names, while the
	# deleted list holds long names
	my $downgraded_file = &util::downgrade_if_dos_filename($file);
	my $oids = $archive_info->get_reverseinfo($downgraded_file);
	$archive_info->remove_reverseinfo($downgraded_file);

	foreach my $oid (@$oids) {
	    # get the record for this OID from doc db
	    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
	    # find the source doc (the primary file that becomes this oid)
	    my $doc_source_file = $doc_rec->{'src-file'}->[0];
	    $doc_source_file = &util::placeholders_to_abspath($doc_source_file, "long");

	    if (!&FileUtils::isFilenameAbsolute($doc_source_file)) {
		$doc_source_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
	    }

	    if ($doc_source_file ne $file) {
		# its an associated or metadata file
		# mark source doc for reimport as one of its assoc files has changed or deleted
		#$doc_source_file = &util::upgrade_if_dos_filename($doc_source_file);
		$block_hash->{'reindex_files'}->{$doc_source_file} = 1;

	    } else {

		# the file to be deleted/reindexed is a primary file. We need to remove all references to this in the src db
		my $assoc_files = $doc_rec->{'assoc-file'};
		foreach my $assocfile (@$assoc_files) {
		    $assocfile = &util::placeholders_to_abspath($assocfile);
		    $archive_info->remove_reverseinfo($assocfile, $oid);
		    if (!defined $archive_info->get_reverseinfo($assocfile)) {
			# nothing refers to it anymore, mark for reindex.
			# block hash needs full filenames
			$assocfile = &util::upgrade_if_dos_filename($assocfile);
			$block_hash->{'reindex_files'}->{$assocfile} = 1;
		    }
		}

	    }
	    # only flip the status once; skip OIDs already marked "D"
	    my $curr_status = $archive_info->get_status_info($oid);
	    if (defined($curr_status) && (($curr_status ne "D"))) {
		if ($verbosity>1) {
		    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
		}
		# mark oid for deletion (it will be deleted or reimported)
		$archive_info->set_status_info($oid,"D");
		# rewrite the doc db record with <index-status> forced to D
		my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
		$val =~ s/^<index-status>(.*)$/<index-status>D/m;

		my $val_rec = &dbutil::convert_infodb_string_to_hash($infodbtype,$val);
		my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

		&dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
		&dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
	    }
	}

    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.

    if ($mode eq "delete") {
	foreach my $file (@$deleted_files) {
	    if (defined $block_hash->{'reindex_files'}->{$file}) {
		delete $block_hash->{'reindex_files'}->{$file};
	    }
	}
    }


}
1580
sub add_dir_contents_to_list {

    # Recursively append every plain file under $dirname to the array
    # ref $list.  The entries "." / ".." and ".svn" bookkeeping dirs are
    # skipped.  Returns -1 if $dirname could not be read (a warning is
    # printed); otherwise returns nothing meaningful.
    my ($dirname, $list) = @_;

    # lexical directory handle [was bareword DIR]; read all entries up
    # front so the handle is closed before we recurse
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
	print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    my @entries = readdir ($dir_handle);
    closedir ($dir_handle);

    foreach my $entry (@entries) {
	next if ($entry =~ m/^\.\.?$/);  # skip . and ..
	next if ($entry =~ /^\.svn$/);   # skip subversion bookkeeping dirs
	my $full_file = &FileUtils::filenameConcatenate($dirname, $entry);
	if (-d $full_file) {
	    &add_dir_contents_to_list($full_file, $list);
	} else {
	    push (@$list, $full_file);
	}
    }

}
1609
1610
16111;
Note: See TracBrowser for help on using the repository browser.