source: main/trunk/greenstone2/perllib/inexport.pm@ 29176

Last change on this file since 29176 was 29096, checked in by kjdon, 10 years ago

New argument to print_txt_usage: pass 1 if you don't want the output paged. We use this when there has been an error and we are outputting the options before quitting the import/build. If the output is paged, the die doesn't get through to the top-level program. So for full-rebuild, if the import died because of a parsing error and the output had been paged, the import was stopped but the system return value was 0, and it would go on to the next stage and try to build. So now, if we are stopping because of an error, we don't page the output. Also added a few more (hopefully) helpful error messages.

  • Property svn:executable set to *
File size: 46.9 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use plugin;
38use plugout;
39use manifest;
40use inexport;
41use util;
42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
47
48use File::Basename;
49
# Enumerated values accepted by the -OIDtype option.  Each entry pairs an
# internal scheme name with its display-string resource key (the "{...}"
# keys are resolved later by gsprintf).  Built with map so that the name
# and its resource key can never drift apart.
my $oidtype_list = [
    map { { 'name' => $_, 'desc' => "{import.OIDtype.$_}" } }
        qw(hash hash_on_full_filename assigned incremental
           filename dirname full_filename)
];
65
# Directory-related options shared by import.pl and export.pl.  Kept
# separate from $inexport::arguments so callers can splice them into their
# own option tables where appropriate.
$inexport::directory_arguments = [
    { 'name'      => "importdir",
      'desc'      => "{import.importdir}",
      'type'      => "string",
      'reqd'      => "no",
      'deft'      => "import",
      'hiddengli' => "yes" },
    { 'name'      => "collectdir",
      'desc'      => "{import.collectdir}",
      'type'      => "string",
      # parsearg left "" as default
      #'deft' => &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect"),
      'deft'      => "",
      'reqd'      => "no",
      'hiddengli' => "yes" },
];
# General command-line options shared by import.pl and export.pl, in the
# table format consumed by parse2/PrintUsage (and GLI via the *gli keys).
$inexport::arguments = [
    # don't set the default to hash - want to allow this to come from
    # entry in collect.cfg but want to override it here
    { 'name'    => "OIDtype",
      'desc'    => "{import.OIDtype}",
      'type'    => "enum",
      'list'    => $oidtype_list,
      'deft'    => "hash_on_full_filename",
      'reqd'    => "no",
      'modegli' => "2" },
    { 'name'    => "OIDmetadata",
      'desc'    => "{import.OIDmetadata}",
      'type'    => "string",
      'deft'    => "dc.Identifier",
      'reqd'    => "no",
      'modegli' => "2" },
    { 'name'      => "site",
      'desc'      => "{import.site}",
      'type'      => "string",
      'deft'      => "",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "manifest",
      'desc'      => "{import.manifest}",
      'type'      => "string",
      'deft'      => "",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "incremental",
      'desc'      => "{import.incremental}",
      'type'      => "flag",
      'hiddengli' => "yes" },
    { 'name'      => "keepold",
      'desc'      => "{import.keepold}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "removeold",
      'desc'      => "{import.removeold}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "language",
      'desc'      => "{scripts.language}",
      'type'      => "string",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'    => "maxdocs",
      'desc'    => "{import.maxdocs}",
      'type'    => "int",
      'reqd'    => "no",
      'deft'    => "-1",
      'range'   => "-1,",
      'modegli' => "1" },
    { 'name'      => "debug",
      'desc'      => "{import.debug}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'    => "faillog",
      'desc'    => "{import.faillog}",
      'type'    => "string",
      # parsearg left "" as default
      #'deft' => &FileUtils::filenameConcatenate("<collectdir>", "colname", "etc", "fail.log"),
      'deft'    => "",
      'reqd'    => "no",
      'modegli' => "3" },
    { 'name'      => "out",
      'desc'      => "{import.out}",
      'type'      => "string",
      'deft'      => "STDERR",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "statsfile",
      'desc'      => "{import.statsfile}",
      'type'      => "string",
      'deft'      => "STDERR",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'    => "verbosity",
      'desc'    => "{import.verbosity}",
      'type'    => "int",
      'range'   => "0,",
      'deft'    => "2",
      'reqd'    => "no",
      'modegli' => "3" },
    { 'name'      => "gli",
      'desc'      => "{scripts.gli}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "xml",
      'desc'      => "{scripts.xml}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
];
183
# Constructor for the command-line scripts (import.pl / export.pl).
# Arguments:
#   $mode                - "import" or "export"
#   $argv                - ref to the script's @ARGV
#   $options             - option spec (hash with an 'args' table) for parse2
#   $opt_listall_options - option spec printed when -listall was given
# Parses the command line into $self, handles -xml / -listall / -h usage
# output, opens the -out file if one was named, and records the single
# leftover argument as the collection name.  Dies (after printing unpaged
# usage) on any argument error so the failure reaches the calling script.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        # trailing 1 = don't page the usage output; paged output would
        # swallow the die below before it reaches the top-level script
        &PrintUsage::print_txt_usage($options, "{import.params}",1);
        print STDERR "Something went wrong during parsing the arguments. Scroll up for details.\n";
        die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($self->{'listall'}) {
        if ($self->{'xml'}) {
            &PrintUsage::print_xml_usage($opt_listall_options);
        }
        else
        {
            &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
        }
        die "\n";
    }

    if ($self->{'xml'}) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # If the user specified -h, then we output the usage
    if (@$argv && $argv->[0] =~ /^\-+h/) {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }
    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    if ($intArgLeftinAfterParsing != 1 )
    {
        # unpaged usage (trailing 1) so the die is seen by the caller
        &PrintUsage::print_txt_usage($options, "{import.params}", 1);
        print STDERR "There should be one argument left after parsing the script args: the collection name.\n";
        die "\n";
    }

    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # 3-arg open: a filename starting with '>' or '|' can no longer
        # alter the open mode (the old 2-arg form was an injection risk).
        # The handle stays a package bareword because the rest of the class
        # refers to it by its qualified name string.
        open (OUT, '>', $out) ||
            (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
        $out = 'inexport::OUT';
        $self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # Unless otherwise stated all manifests are considered version 1---where
    # they act more like an advanced process expression---as compared to newer
    # manifest files that act as an explicit (and exhaustive) list of files to
    # process [jmt12]
    $self->{'manifest_version'} = 1;

    return bless $self, $class;
}
268
269# Simplified version of the contstructor for use with CGI scripts
# Simplified constructor for use with CGI scripts: no command-line parsing,
# just fill in the few fields the rest of the class relies on.
sub newCGI
{
    my $class = shift (@_);
    my ($mode, $collect, $gsdl_cgi, $opt_site) = @_;

    # Same base state as new(): no -xml output, remember the run mode.
    my $self = { 'xml' => 0, 'mode' => $mode };

    # CGI output always goes to the web server's error stream.
    $self->{'out'} = STDERR;

    if (defined $gsdl_cgi) {
        # Running under the Greenstone CGI layer: ask it where the
        # site's collect directory lives.
        $self->{'site'}       = $opt_site;
        $self->{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }
    else {
        # Stand-alone fallback: default to $GSDLHOME/collect with no site.
        $self->{'site'}       = "";
        $self->{'collectdir'} = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
    }

    $self->{'faillog'}    = "";
    $self->{'collection'} = $collect;

    return bless $self, $class;
}
# Accessor: the collection name captured from the command line (new) or
# passed in directly (newCGI).
sub get_collection
{
    my ($self) = @_;

    return $self->{'collection'};
}
300
301
# Resolves the collection named $collection, opens the fail log, and reads
# the collection configuration file (collect.cfg for GS2, the GS3 variant
# otherwise).  Returns ($config_filename, $collectcfg).  Dies if the
# collection cannot be found.  Side effects on $self: sets 'gs_version',
# 'faillog', 'faillogname' and 'close_faillog'; also augments @INC with
# the collection's own perllib directory.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
        #&PrintUsage::print_txt_usage($options, "{import.params}", 1);
        die "\n";
    }

    # set gs_version 2/3: a non-empty site name means we are running GS3
    $self->{'gs_version'} = "2";
    if ((defined $site) && ($site ne "")) {
        # gs3
        $self->{'gs_version'} = "3";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib'));

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
        $faillog = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # 3-arg open so a path beginning with '>' or '|' cannot change the mode
    # (the old 2-arg form interpolated the filename into the mode string)
    open (FAILLOG, '>', $faillog) ||
        (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);

    my $faillogname = $faillog;
    # hand the handle around by its qualified bareword name; FileHandle
    # makes method calls like autoflush work on the name string
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;
    $self->{'close_faillog'} = 1;

    # Read in the collection configuration file.
    my $gs_mode = "gs".$self->{'gs_version'}; #gs2 or gs3
    my $config_filename = &colcfg::get_collect_cfg_name($out, $gs_mode);
    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
350
# Merges collection-configuration values into $self, resolves the import
# and archive (or export) directories to absolute paths, and settles the
# removeold/keepold/incremental flags via scriptutil.  The 'default_*'
# entries on $self indicate the user did not supply the option on the
# command line, so a collect.cfg value (if present) may override it.
# Dies if the resolved import directory does not exist.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
    my $out = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
        $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
        # we can't use the text version for archives dbs.
        $collectcfg->{'infodbtype'} = "gdbm";
    }

    if (defined $self->{'default_importdir'} && defined $collectcfg->{'importdir'}) {
        $importdir = $collectcfg->{'importdir'};
    }

    if ($inexport_mode eq "import") {
        if ( defined $self->{'default_archivedir'} && defined $collectcfg->{'archivedir'}) {
            $archivedir = $collectcfg->{'archivedir'};
        }
    }
    elsif ($inexport_mode eq "export") {
        if (defined $self->{'default_exportdir'} && defined $collectcfg->{'exportdir'}) {
            $archivedir = $collectcfg->{'exportdir'};
        }
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    if (!&FileUtils::isFilenameAbsolute($importdir))
    {
        $importdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $importdir);
    }
    else
    {
        # Don't do this - it kills protocol prefixes
        #$importdir =~ s/[\\\/]+/\//g;
        #$importdir =~ s/\/$//;
        # Do this instead.  NOTE: capture the return value - the archivedir
        # branch below does, and previously the result was discarded here so
        # the import directory was never actually sanitized.
        $importdir = &FileUtils::sanitizePath($importdir);
    }
    if (!&FileUtils::directoryExists($importdir))
    {
        &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
        die "\n";
    }
    $self->{'importdir'} = $importdir;

    if (!&FileUtils::isFilenameAbsolute($archivedir)) {
        $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $archivedir);
    }
    else {
        $archivedir = &FileUtils::sanitizePath($archivedir);
    }
    $self->{'archivedir'} = $archivedir;

    if (defined $self->{'default_verbosity'}) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $self->{'verbosity'} = $collectcfg->{'verbosity'};
        }
    }

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
        $self->{'manifest'} = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $self->{'gzip'} = 1;
        }
    }

    if (defined $self->{'default_maxdocs'}) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
        }
    }

    if (defined $self->{'default_OIDtype'} ) {
        if (defined $collectcfg->{'OIDtype'}
            && $collectcfg->{'OIDtype'} =~ /^(hash|hash_on_full_filename|incremental|assigned|filename|dirname|full_filename)$/) {
            $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
        }
    }

    if (defined $self->{'default_OIDmetadata'}) {
        if (defined $collectcfg->{'OIDmetadata'}) {
            $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
        }
    }

    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
                                                   $self->{'incremental'}, $checkdir,
                                                   $collectcfg);

    $self->{'removeold'} = $removeold;
    $self->{'keepold'} = $keepold;
    $self->{'incremental'} = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;

    # Since this wasted my morning, let's at least warn a user that manifest
    # files now *only* work if keepold is set [jmt12]
    if ($self->{'manifest'} && !$self->{'keepold'})
    {
        print STDERR "Warning: -manifest flag should not be specified without also setting -keepold or -incremental\n";
    }
}
481
482sub process_files
483{
484 my $self = shift @_;
485 my ($config_filename,$collectcfg) = @_;
486
487 my $inexport_mode = $self->{'mode'};
488
489 my $verbosity = $self->{'verbosity'};
490 my $debug = $self->{'debug'};
491
492 my $importdir = $self->{'importdir'};
493 my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
494
495 my $incremental = $self->{'incremental'};
496 my $incremental_mode = $self->{'incremental_mode'};
497
498 my $gs_version = $self->{'gs_version'};
499
500 my $removeold = $self->{'removeold'};
501 my $keepold = $self->{'keepold'};
502
503 my $saveas = $self->{'saveas'};
504 my $saveas_options = $self->{'saveas_options'};
505 my $OIDtype = $self->{'OIDtype'};
506 my $OIDmetadata = $self->{'OIDmetadata'};
507
508 my $out = $self->{'out'};
509 my $faillog = $self->{'faillog'};
510
511 my $maxdocs = $self->{'maxdocs'};
512 my $gzip = $self->{'gzip'};
513 my $groupsize = $self->{'groupsize'};
514 my $sortmeta = $self->{'sortmeta'};
515
516 my $removeprefix = $self->{'removeprefix'};
517 my $removesuffix = $self->{'removesuffix'};
518
519 my $gli = $self->{'gli'};
520
521 # related to export
522 my $xsltfile = $self->{'xsltfile'};
523 my $group_marc = $self->{'group_marc'};
524 my $mapping_file = $self->{'mapping_file'};
525 my $xslt_mets = $self->{'xslt_mets'};
526 my $xslt_txt = $self->{'xslt_txt'};
527 my $fedora_namespace = $self->{'fedora_namespace'};
528 my $metadata_prefix = $self->{'metadata_prefix'};
529
530 if ($inexport_mode eq "import") {
531 print STDERR "<Import>\n" if $gli;
532 }
533 else {
534 print STDERR "<export>\n" if $gli;
535 }
536
537 my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
538 if ($self->{'manifest'} ne "") {
539 my $manifest_filename = $self->{'manifest'};
540
541 if (!&FileUtils::isFilenameAbsolute($manifest_filename)) {
542 $manifest_filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
543 }
544
545 $self->{'manifest'} = &FileUtils::sanitizePath($self->{'manifest'});
546 #$self->{'manifest'} =~ s/[\\\/]+/\//g;
547 #$self->{'manifest'} =~ s/\/$//;
548
549 $manifest_lookup->parse($manifest_filename);
550
551 # manifests may now include a version number [jmt12]
552 $self->{'manifest_version'} = $manifest_lookup->get_version();
553 }
554
555 my $manifest = $self->{'manifest'};
556
557 # load all the plugins
558 my $plugins = [];
559 if (defined $collectcfg->{'plugin'}) {
560 $plugins = $collectcfg->{'plugin'};
561 }
562
563 my $plugin_incr_mode = $incremental_mode;
564 if ($manifest ne "") {
565 # if we have a manifest file, then we pretend we are fully incremental for plugins
566 $plugin_incr_mode = "all";
567 }
568 #some global options for the plugins
569 my @global_opts = ();
570
571 my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode, $gs_version);
572 if (scalar(@$pluginfo) == 0) {
573 &gsprintf($out, "{import.no_plugins_loaded}\n");
574 die "\n";
575 }
576
577 # remove the old contents of the archives directory (and tmp
578 # directory) if needed
579
580 if ($removeold) {
581 if (&FileUtils::directoryExists($archivedir)) {
582 &gsprintf($out, "{import.removing_archives}\n");
583 &FileUtils::removeFilesRecursive($archivedir);
584 }
585 my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
586 $tmpdir =~ s/[\\\/]+/\//g;
587 $tmpdir =~ s/\/$//;
588 if (&FileUtils::directoryExists($tmpdir)) {
589 &gsprintf($out, "{import.removing_tmpdir}\n");
590 &FileUtils::removeFilesRecursive($tmpdir);
591 }
592 }
593
594 # create the archives dir if needed
595 &FileUtils::makeAllDirectories($archivedir);
596
597 # read the archive information file
598
599 # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
600 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-doc"));
601 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-src"));
602
603 # When we make these initial calls to determine the archive information doc
604 # and src databases we pass through a '1' to indicate this is the first
605 # time we are referring to these databases. When using dynamic dbutils
606 # (available in extensions) this indicates to some database types (for
607 # example, persistent servers) that this is a good time to perform any
608 # one time initialization. The argument has no effect on vanilla dbutils
609 # [jmt12]
610 my $perform_firsttime_init = 1;
611 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, $perform_firsttime_init);
612 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, $perform_firsttime_init);
613
614 my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
615 $archive_info->load_info ($arcinfo_doc_filename);
616
617 if ($manifest eq "") {
618 # Load in list of files in import folder from last import (if present)
619 $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
620 }
621
622 ####Use Plugout####
623 my $plugout;
624
625 my $generate_auxiliary_files = 0;
626 if ($inexport_mode eq "import") {
627 $generate_auxiliary_files = 1;
628 }
629 elsif ($self->{'include_auxiliary_database_files'}) {
630 $generate_auxiliary_files = 1;
631 }
632 $self->{'generate_auxiliary_files'} = $generate_auxiliary_files;
633
634 # Option to use user defined plugout
635 if ($inexport_mode eq "import") {
636 if (defined $collectcfg->{'plugout'}) {
637 # If a plugout was specified in the collect.cfg file, assume it is sensible
638 # We can't check the name because it could be anything, if it is a custom plugout
639 print STDERR "Using plugout specified in collect.cfg: ".join(' ', @{$collectcfg->{'plugout'}})."\n";
640 $plugout = $collectcfg->{'plugout'};
641 }
642 else {
643 push @$plugout,$saveas."Plugout";
644 }
645
646 }
647 else {
648 if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(GreenstoneXML|.*METS|DSpace|MARCXML)Plugout/) {
649 $plugout = $collectcfg->{'plugout'};
650 print STDERR "Using plugout specified in collect.cfg: $collectcfg->{'plugout'}\n";
651 }
652 else {
653 push @$plugout,$saveas."Plugout";
654 }
655 }
656
657 my $plugout_name = $plugout->[0];
658
659 if ($inexport_mode eq "export" && defined $saveas_options) {
660 my @user_plugout_options = split(" ", $saveas_options);
661 push @$plugout, @user_plugout_options;
662 }
663 push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
664 push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
665 push @$plugout,("-debug") if ($debug);
666 push @$plugout,("-gzip_output") if ($gzip);
667 push @$plugout,("-output_handle",$out) if (defined $out);
668
669 push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");
670 push @$plugout, ("-no_auxiliary_databases") if ($generate_auxiliary_files == 0);
671 if ($inexport_mode eq "import") {
672 if ($plugout_name =~ m/^GreenstoneXMLPlugout$/) {
673 push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
674 }
675 }
676 my $processor = &plugout::load_plugout($plugout);
677 $processor->setoutputdir ($archivedir);
678 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
679 $processor->set_OIDtype ($OIDtype, $OIDmetadata);
680 $processor->begin();
681 &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
682
683 if ($removeold) {
684 # occasionally, plugins may want to do something on remove
685 # old, eg pharos image indexing
686 &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
687 }
688
689 # process the import directory
690 my $block_hash = {};
691 $block_hash->{'new_files'} = {};
692 $block_hash->{'reindex_files'} = {};
693 # all of these are set somewhere else, so it's more readable to define them
694 # here [jmt12]
695 $block_hash->{'all_files'} = {};
696 $block_hash->{'deleted_files'} = {};
697 $block_hash->{'file_blocks'} = {};
698 $block_hash->{'metadata_files'} = {};
699 $block_hash->{'shared_fileroot'} = '';
700 # a new flag so we can tell we had a manifest way down in the plugins
701 # [jmt12]
702 $block_hash->{'manifest'} = 'false';
703 my $metadata = {};
704
705 # global blocking pass may set up some metadata
706 # - when we have a newer manifest file we don't do this -unless- the
707 # collection configuration indicates this collection contains complex
708 # (inherited) metadata [jmt12]
709 if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true'))
710 {
711 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
712 }
713 else
714 {
715 print STDERR "Skipping global file scan due to manifest and complexmeta configuration\n";
716 }
717
718 if ($manifest ne "") {
719
720 # mark that we are using a manifest - information that might be needed
721 # down in plugins (for instance DirectoryPlugin)
722 $block_hash->{'manifest'} = $self->{'manifest_version'};
723
724 #
725 # 1. Process delete files first
726 #
727 my @deleted_files = keys %{$manifest_lookup->{'delete'}};
728 my @full_deleted_files = ();
729
730 # ensure all filenames are absolute
731 foreach my $df (@deleted_files) {
732 my $full_df =
733 (&FileUtils::isFilenameAbsolute($df))
734 ? $df
735 : &FileUtils::filenameConcatenate($importdir,$df);
736
737 if (-d $full_df) {
738 &add_dir_contents_to_list($full_df, \@full_deleted_files);
739 } else {
740 push(@full_deleted_files,$full_df);
741 }
742 }
743
744 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
745 mark_docs_for_deletion($archive_info,{},
746 \@full_deleted_files,
747 $archivedir, $verbosity, "delete");
748
749
750 #
751 # 2. Now files for reindexing
752 #
753
754 my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
755 my @full_reindex_files = ();
756 # ensure all filenames are absolute
757 foreach my $rf (@reindex_files) {
758 my $full_rf =
759 (&FileUtils::isFilenameAbsolute($rf))
760 ? $rf
761 : &FileUtils::filenameConcatenate($importdir,$rf);
762
763 if (-d $full_rf) {
764 &add_dir_contents_to_list($full_rf, \@full_reindex_files);
765 } else {
766 push(@full_reindex_files,$full_rf);
767 }
768 }
769
770 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
771 mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
772
773 # And now to ensure the new version of the file processed by
774 # appropriate plugin, we need to add it to block_hash reindex list
775 foreach my $full_rf (@full_reindex_files) {
776 $block_hash->{'reindex_files'}->{$full_rf} = 1;
777 }
778
779
780 #
781 # 3. Now finally any new files - add to block_hash new_files list
782 #
783
784 my @new_files = keys %{$manifest_lookup->{'index'}};
785 my @full_new_files = ();
786
787 foreach my $nf (@new_files) {
788 # ensure filename is absolute
789 my $full_nf =
790 (&FileUtils::isFilenameAbsolute($nf))
791 ? $nf
792 : &FileUtils::filenameConcatenate($importdir,$nf);
793
794 if (-d $full_nf) {
795 &add_dir_contents_to_list($full_nf, \@full_new_files);
796 } else {
797 push(@full_new_files,$full_nf);
798 }
799 }
800
801 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
802 # need to check this file exists before trying to read it - in the past
803 # it wasn't possible to have a manifest unless keepold was also set so
804 # you were pretty much guarenteed arcinfo existed
805 # [jmt12]
806 # @todo &FileUtils::fileExists($arcinfo_src_filename) [jmt12]
807 if (-e $arcinfo_src_filename)
808 {
809 my $arcinfodb_map = {};
810 &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
811 foreach my $f (@full_new_files) {
812 my $rel_f = &util::abspath_to_placeholders($f);
813
814 # check that we haven't seen it already
815 if (defined $arcinfodb_map->{$rel_f}) {
816 # TODO make better warning
817 print STDERR "Warning: $f ($rel_f) already in src archive, \n";
818 } else {
819 $block_hash->{'new_files'}->{$f} = 1;
820 }
821 }
822
823 undef $arcinfodb_map;
824 }
825 # no existing files - so we can just add all the files [jmt12]
826 else
827 {
828 foreach my $f (@full_new_files)
829 {
830 $block_hash->{'new_files'}->{$f} = 1;
831 }
832 }
833
834 # If we are not using complex inherited metadata (and thus have skipped
835 # the global file scan) we need to at least check for a matching
836 # metadata.xml for the files being indexed/reindexed
837 # - unless we are using the newer version of Manifests, which are treated
838 # verbatim, and should have a metadata element for metadata files (so
839 # we can explicitly process metadata files other than metadata.xml)
840 # [jmt12]
841 if ($self->{'manifest_version'} < 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true'))
842 {
843 my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
844 foreach my $file_to_import (@all_files_to_import)
845 {
846 my $metadata_xml_path = $file_to_import;
847 $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/;
848 if (&FileUtils::fileExists($metadata_xml_path))
849 {
850 &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli);
851 }
852 }
853 }
854
855 # new version manifest files explicitly list metadata files to be
856 # processed (ignoring complexmeta if set)
857 # [jmt12]
858 if ($self->{'manifest_version'} > 1)
859 {
860 # Process metadata files
861 foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}})
862 {
863 $self->perform_process_files($manifest, $pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs);
864 }
865 }
866 }
867 else {
868 # if incremental, we read through the import folder to see whats changed.
869
870 if ($incremental || $incremental_mode eq "onlyadd") {
871 prime_doc_oid_count($archivedir);
872
873 # Can now work out which files were new, already existed, and have
874 # been deleted
875
876 new_vs_old_import_diff($archive_info,$block_hash,$importdir,
877 $archivedir,$verbosity,$incremental_mode);
878
879 my @new_files = sort keys %{$block_hash->{'new_files'}};
880 if (scalar(@new_files>0)) {
881 print STDERR "New files and modified metadata files since last import:\n ";
882 print STDERR join("\n ",@new_files), "\n";
883 }
884
885 if ($incremental) {
886 # only look for deletions if we are truely incremental
887 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
888 # Filter out any in gsdl/tmp area
889 my @filtered_deleted_files = ();
890 my $gsdl_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
891 my $collect_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
892 $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
893 $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
894
895 foreach my $df (@deleted_files) {
896 next if ($df =~ m/^$gsdl_tmp_area/);
897 next if ($df =~ m/^$collect_tmp_area/);
898
899 push(@filtered_deleted_files,$df);
900 }
901
902
903 @deleted_files = @filtered_deleted_files;
904
905 if (scalar(@deleted_files)>0) {
906 print STDERR "Files deleted since last import:\n ";
907 print STDERR join("\n ",@deleted_files), "\n";
908
909
910 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
911
912 mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
913 }
914
915 my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
916
917 if (scalar(@reindex_files)>0) {
918 print STDERR "Files to reindex since last import:\n ";
919 print STDERR join("\n ",@reindex_files), "\n";
920 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
921 mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
922 }
923
924 }
925 }
926 }
927
928 # Check for existence of the file that's to contain earliestDateStamp in archivesdir
929 # Do nothing if the file already exists (file exists on incremental build).
930 # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
931 # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
932 # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
933 # oailastmodified and oailastmodifieddate
934 my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp");
935 if ($self->{'generate_auxiliary_files'}) {
936 if (!-f $earliestDatestampFile && -d $archivedir) {
937 my $current_time_in_seconds = time; # in seconds
938
939 if(open(FOUT, ">$earliestDatestampFile")) {
940 # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
941 print FOUT $current_time_in_seconds;
942 close(FOUT);
943 }
944 else {
945 &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
946 }
947
948 }
949 }
950
951 $self->perform_process_files($manifest, $pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs);
952
953 if ($saveas eq "FedoraMETS") {
954 # create collection "doc obj" for Fedora that contains
955 # collection-level metadata
956
957 my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
958 $doc_obj->set_OID("collection");
959
960 my $col_name = undef;
961 my $col_meta = $collectcfg->{'collectionmeta'};
962
963 if (defined $col_meta) {
964 store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
965 store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
966 }
967 $processor->process($doc_obj);
968 }
969
970 &plugin::end($pluginfo, $processor);
971
972 &plugin::deinit($pluginfo, $processor);
973
974 # Store the value of OIDCount (used in doc.pm) so it can be
975 # restored correctly to this value on an incremental build
976 # - this OIDcount file should only be generated for numerical oids [jmt12]
977 if ($self->{'OIDtype'} eq 'incremental')
978 {
979 store_doc_oid_count($archivedir);
980 }
981
982 # signal to the processor (plugout) that we have finished processing - if we are group processing, then the final output file needs closing.
983 $processor->close_group_output() if $processor->is_group();
984
985# if ($inexport_mode eq "import") {
986 if ($self->{'generate_auxiliary_files'}) {
987 # write out the archive information file
988 # for backwards compatability with archvies.inf file
989 if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
990 $archive_info->save_info($arcinfo_doc_filename);
991 }
992 else {
993 $archive_info->save_revinfo_db($arcinfo_src_filename);
994 }
995 }
996 return $pluginfo;
997}
998
# @function perform_process_files()
# While process_files() above prepares the system to import files, this is
# the function that actually initiates the plugin pipeline over the files.
# Subclasses of inexport.pm may override this to do different or further
# processing.
# @author jmt12
sub perform_process_files
{
    my $self = shift(@_);
    my ($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs) = @_;
    my $gli = $self->{'gli'};

    # Manifest version 2+ hands us one specific file to process.
    if ($file_to_import ne '')
    {
	&plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
	return;
    }

    # Global file scan.  With a new-version manifest the files were already
    # read above; older manifests steer the import through extra settings in
    # $block_hash, while non-manifest imports use a regular $block_hash
    # (so obeying process_exp and block_exp). [jmt12]
    if ($manifest eq '' || $self->{'manifest_version'} < 1)
    {
	&plugin::read ($pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
	return;
    }

    print STDERR "Skipping perform_process_files() due to manifest presence and version\n";
}
# perform_process_files()
1029
# @function generate_statistics()
# Print the "<mode> complete" banner to the configured output stream and
# ask each plugin to report its processing statistics (and failures).
sub generate_statistics
{
    my ($self, $pluginfo) = @_;

    my $inexport_mode = $self->{'mode'};
    my $out           = $self->{'out'};

    my $banner = "*********************************************\n";

    &gsprintf($out, "\n");
    &gsprintf($out, $banner);
    &gsprintf($out, "{$inexport_mode.complete}\n");
    &gsprintf($out, $banner);

    &plugin::write_stats($pluginfo, 'STDERR', $self->{'faillogname'}, $self->{'gli'});
}
# generate_statistics()
1049
1050
# @function deinit()
# Close down any file handles that we opened (and hence are responsible
# for closing).  OUT and FAILLOG are package-level bareword handles opened
# elsewhere in this module; the close_* flags record whether we own them.
sub deinit
{
    my $self = shift(@_);
    if ($self->{'close_out'}) {
	close OUT;
    }
    if ($self->{'close_faillog'}) {
	close FAILLOG;
    }
}
# deinit()
1061
1062
# Copy one collection-level metadata field onto the top section of a
# collection "doc obj" as ex.<field> metadata.  Keys of the per-field hash
# may carry a language qualifier of the form "[l=xx]", which becomes a
# "^xx" suffix on the metadata label.
sub store_collectionmeta
{
    my ($collectionmeta,$field,$doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $lang_values = $collectionmeta->{$field};

    while (my ($lang_key, $value) = each %$lang_values)
    {
	### print STDERR "*** $lang_key = $value\n";

	my $md_label = "ex.$field";

	# "[l=xx]" keys are language-qualified values
	if ($lang_key =~ m/^\[l=(.*?)\]$/)
	{
	    $md_label .= "^$1";
	}

	$doc_obj->add_utf8_metadata($top_section, $md_label, $value);

	# see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
	# while "collectionname" in GS2 is called "name" in GS3.
	# Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
	if (($md_label eq "ex.collectionname^en") || ($md_label eq "ex.collectionname"))
	{
	    $doc_obj->add_utf8_metadata($top_section, "dc.Title", $value);
	}
    }
}
1100
1101
# Full path of the "OIDcount" state file kept inside the given archives
# directory (used to persist doc.pm's OID counter between builds).
sub oid_count_file {
    my $archivedir = shift(@_);
    return &FileUtils::filenameConcatenate($archivedir, "OIDcount");
}
1106
1107
# Restore doc.pm's OID counter ($doc::OIDcount) from the "OIDcount" file in
# the archives directory, so numerical OIDs continue from where the previous
# (incremental) build left off.  Does nothing if the file does not exist
# (the normal case on a full build); warns (but does not die) if the file
# exists yet cannot be opened.
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # absent file => nothing to prime (full build)
    return unless (-e $oid_count_filename);

    # three-arg open with a lexical handle: avoids mode injection via the
    # filename and doesn't clobber a package-wide bareword handle
    if (open(my $oid_in, '<', $oid_count_filename)) {
	my $OIDcount = <$oid_in>;
	close($oid_in);

	# guard against an empty/truncated file: don't clobber the counter
	# with undef
	if (defined $OIDcount) {
	    chomp $OIDcount;
	    $doc::OIDcount = $OIDcount;
	}
    }
    else {
	&gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
    }
}
1127
# Use the file "OIDcount" in the archives directory to record what value
# doc.pm's OID counter ($doc::OIDcount) got up to, so an incremental build
# can later resume from it (see prime_doc_oid_count above).  Warns (but
# does not die) if the file cannot be written.
sub store_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # three-arg open with a lexical handle (was a 2-arg bareword open);
    # @todo $oidout = &FileUtils::openFileDescriptor($oid_count_filename, 'w') [jmt12]
    if (open(my $oid_out, '>', $oid_count_filename)) {
	print {$oid_out} $doc::OIDcount, "\n";

	# buffered write errors only surface at close, so check it too
	if (!close($oid_out)) {
	    &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
	}
    }
    else {
	&gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
1146
1147
1148
# @function new_vs_old_import_diff()
# Compare the files currently in the import area ($block_hash->{'all_files'})
# against the file list recorded at the previous import
# ($archive_info->{'prev_import_filelist'}), and classify every file into one
# of the $block_hash sets: 'new_files', 'existing_files',
# 'new_or_modified_metadata_files', 'reindex_files' or 'deleted_files'.
# $incremental_mode is "all" for a true incremental build, or "onlyadd" when
# only additions are expected.
sub new_vs_old_import_diff
{
    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    # -M gives age in days relative to script start-up, so a *smaller* value
    # means more recently modified; files whose -M is below this timestamp
    # have changed since the archive info database was last written.
    my $archiveinf_timestamp = -M $arcinfo_doc_filename;

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default

    my $prev_all_files = $archive_info->{'prev_import_filelist'};
    my $full_prev_all_files = {};   # absolute filename -> filename as originally recorded

    foreach my $prev_file (keys %$prev_all_files) {

	if (!&FileUtils::isFilenameAbsolute($prev_file)) {
	    my $full_prev_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$prev_file);
	    $full_prev_all_files->{$full_prev_file} = $prev_file;
	}
	else {
	    $full_prev_all_files->{$prev_file} = $prev_file;
	}
    }


    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

	my $full_curr_file = $curr_file;

	# entry in 'all_files' is moved to either 'existing_files',
	# 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

	if (!&FileUtils::isFilenameAbsolute($curr_file)) {
	    # add in import dir to make absolute
	    $full_curr_file = &FileUtils::filenameConcatenate($importdir,$curr_file);
	}

	# figure out if new file or not
	if (defined $full_prev_all_files->{$full_curr_file}) {
	    # delete it so that only files that need deleting are left
	    delete $full_prev_all_files->{$full_curr_file};

	    # had it before. is it a metadata file?
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {

		# is it modified??
		if (-M $full_curr_file < $archiveinf_timestamp) {
		    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
		    # its newer than last build
		    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
		}
	    }
	    else {
		if ($incremental_mode eq "all") {

		    # had it before
		    $block_hash->{'existing_files'}->{$full_curr_file} = 1;

		}
		else {
		    # Warning in "onlyadd" mode, but had it before!
		    print STDERR "Warning: File $full_curr_file previously imported.\n";
		    print STDERR " Treating as new file\n";

		    $block_hash->{'new_files'}->{$full_curr_file} = 1;

		}
	    }
	}
	else {
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# the new file is the special sort of file greenstone uses
		# to attach metadata to src documents
		# i.e metadata.xml
		# (but note, the filename used is not constrained in
		# Greenstone to always be this)

		print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
		$block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
	    }
	    else {
		$block_hash->{'new_files'}->{$full_curr_file} = 1;
	    }
	}


	delete $block_hash->{'all_files'}->{$curr_file};
    }




    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
	my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

	$situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
	$situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression

	# Go through existing_files, and mark anything that is contained
	# within 'situated_dir' to be reindexed (in case some of the metadata
	# attaches to one of these files)

	my $reindex_files = [];

	foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

	    if ($existing_f =~ m/^$situated_dir/) {

		print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

		push(@$reindex_files,$existing_f);
		$block_hash->{'reindex_files'}->{$existing_f} = 1;
		delete $block_hash->{'existing_files'}->{$existing_f};

	    }
	}

	# metadata file needs to be in new_files list so parsed by MetadataXMLPlug
	# (or equivalent)
	$block_hash->{'new_files'}->{$new_mdf} = 1;

    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
	if (-M $existing_filename < $archiveinf_timestamp) {
	    # file is newer than last build

	    my $existing_file = $existing_filename;
	    #my $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'});

	    #my $collectdir_resafe = &util::filename_to_regex($collectdir);
	    #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;

	    print STDERR "**** Reindexing existing file: $existing_file\n";

	    push(@$reindex_files,$existing_file);
	    $block_hash->{'reindex_files'}->{$existing_filename} = 1;
	}

    }


    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # supress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    my @deleted_files = values %$full_prev_all_files;
    # NOTE(review): map in void context used purely for its side effect on
    # $block_hash; only files *absent* from the filesystem are marked deleted
    map { my $curr_file = $_;
	  my $full_curr_file = $curr_file;

	  if (!&FileUtils::isFilenameAbsolute($curr_file)) {
	      # add in import dir to make absolute

	      $full_curr_file = &FileUtils::filenameConcatenate($collectdir,$curr_file);
	  }


	  if (!-e $full_curr_file) {
	      $block_hash->{'deleted_files'}->{$curr_file} = 1;
	  }
	} @deleted_files;



}
1346
1347
# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
# $mode is 'delete' or 'reindex'
# For each file in @$deleted_files: look up (via the archiveinf-src
# database) every OID built from that file, delete the file's src record,
# flip each OID's index-status to "D" in archiveinf-doc (buildcol later
# deletes or reimports it), and queue source docs whose *associated* files
# changed into $block_hash->{'reindex_files'}.
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
	$mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
	# use 'archiveinf-src' info database file to look up all the OIDs
	# that this file is used in (note in most cases, it's just one OID)

	# stored paths use placeholders so they survive the collection moving
	my $relfile = &util::abspath_to_placeholders($file);

	my $src_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $relfile);
	my $oids = $src_rec->{'oid'};
	my $file_record_deleted = 0; # NOTE(review): never read afterwards

	# delete the src record
	my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
	&dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $relfile);
	&dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);


	foreach my $oid (@$oids) {

	    # find the source doc (the primary file that becomes this oid)
	    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
	    my $doc_source_file = $doc_rec->{'src-file'}->[0];
	    $doc_source_file = &util::placeholders_to_abspath($doc_source_file);

	    if (!&FileUtils::isFilenameAbsolute($doc_source_file)) {
		$doc_source_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
	    }

	    if ($doc_source_file ne $file) {
		# its an associated or metadata file

		# mark source doc for reimport as one of its assoc files has changed or deleted
		$block_hash->{'reindex_files'}->{$doc_source_file} = 1;

	    }
	    my $curr_status = $archive_info->get_status_info($oid);
	    if (defined($curr_status) && (($curr_status ne "D"))) {
		if ($verbosity>1) {
		    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
		}
		# mark oid for deletion (it will be deleted or reimported)
		$archive_info->set_status_info($oid,"D");
		# rewrite the raw db record with its <index-status> line set to D
		my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
		$val =~ s/^<index-status>(.*)$/<index-status>D/m;

		my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
		my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

		&dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
		&dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
	    }
	}

    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.

    if ($mode eq "delete") {
	foreach my $file (@$deleted_files) {
	    if (defined $block_hash->{'reindex_files'}->{$file}) {
		delete $block_hash->{'reindex_files'}->{$file};
	    }
	}
    }


}
1436
# Recursively append every file found under $dirname onto @$list, skipping
# '.'/'..' and Subversion administrative directories.
# Returns -1 if the directory cannot be read, 0 otherwise.
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # lexical directory handle (was a bareword DIR handle); also removes the
    # dead pre-declaration of @dir/$subfile from the original
    my $dir_handle;
    if (!opendir($dir_handle, $dirname)) {
	print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    my @entries = readdir($dir_handle);
    closedir($dir_handle);

    foreach my $subfile (@entries) {
	next if ($subfile =~ m/^\.\.?$/); # skip . and ..
	next if ($subfile =~ /^\.svn$/);  # skip subversion metadata
	my $full_file = &FileUtils::filenameConcatenate($dirname, $subfile);
	if (-d $full_file) {
	    &add_dir_contents_to_list($full_file, $list);
	} else {
	    push (@$list, $full_file);
	}
    }
    return 0;
}
1465
1466
14671;
Note: See TracBrowser for help on using the repository browser.