source: main/trunk/greenstone2/perllib/inexport.pm@ 28077

Last change on this file since 28077 was 28077, checked in by ak19, 11 years ago

Fix noticed when doing diffcol. When the import option saveas was set to GreenstoneMETS in collect.cfg, this was ignored, although passing it as a cmdline option to import.pl worked. The reason was that the saveas option had a default assigned, unlike OIDtype, and if it already had a value, then any value in collect.cfg was never consulted. Now the METS tutorial collection should work from the cmdline.

  • Property svn:executable set to *
File size: 45.0 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use plugin;
38use plugout;
39use manifest;
40use inexport;
41use util;
42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
47
48use File::Basename;
49
# Constructor for command-line driven import/export runs (import.pl/export.pl).
#
# Parameters:
#   $mode                - "import" or "export"; stored for mode-specific behaviour later
#   $argv                - ref to the remaining command-line arguments
#   $options             - hash whose 'args' entry describes the recognized options
#   $opt_listall_options - option set printed when -listall is given
#
# Returns a blessed inexport object. Dies (after printing usage) on bad
# arguments, or returns early after printing XML usage when -xml was given.
# Side effects: may open the package-level OUT filehandle, loads language
# resource bundles, and pops the collection name off @$argv.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins; parse2::parse fills $self
    # with the recognized option values and returns how many args are left
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    # -listall: print the full option set (for every plugin) and stop
    if ($self->{'listall'}) {
        if ($self->{'xml'}) {
            &PrintUsage::print_xml_usage($opt_listall_options);
        }
        else
        {
            &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
        }
        die "\n";
    }

    # -xml: print machine-readable usage (consumed by the GLI) and return
    # without requiring a collection name
    if ($self->{'xml'}) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also
    if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # -out may name a file instead of STDERR/STDOUT; open it on the
    # package-level OUT handle and carry it around as the string
    # 'inexport::OUT' (resolved at call time thanks to "no strict 'refs'"
    # at the top of this file)
    # NOTE(review): if gsprintf() returned false, the trailing "&& die"
    # would never fire and a failed open would be silently ignored
    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        open (OUT, ">$out") ||
            (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
        $out = 'inexport::OUT';
        $self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # Unless otherwise stated all manifests are considered version 1---where
    # they act more like an advanced process expression---as compared to newer
    # manifest files that act as an explicit (and exhaustive) list of files to
    # process [jmt12]
    $self->{'manifest_version'} = 1;

    return bless $self, $class;
}
129
# Simplified version of the constructor for use with CGI scripts.
#
# Parameters:
#   $mode     - "import" or "export"
#   $collect  - collection name
#   $gsdl_cgi - optional CGI helper object; when supplied it provides the
#               site-specific collect directory via get_collection_dir()
#   $opt_site - site name (only meaningful alongside $gsdl_cgi)
#
# Returns a blessed inexport object. No command-line parsing is performed
# and all output goes to STDERR.
sub newCGI
{
    my ($class, $mode, $collect, $gsdl_cgi, $opt_site) = @_;

    my $self = {
        'xml'  => 0,
        'mode' => $mode,
    };
    $self->{'out'} = STDERR;

    if (defined $gsdl_cgi) {
        # CGI context: ask the helper where this site keeps its collections
        $self->{'site'}       = $opt_site;
        $self->{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }
    else {
        # No CGI helper: fall back to the default GS2 layout under GSDLHOME
        $self->{'site'}       = "";
        $self->{'collectdir'} = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
    }

    $self->{'faillog'}    = "";
    $self->{'collection'} = $collect;

    return bless $self, $class;
}
# Accessor: returns the name of the collection this run operates on.
sub get_collection
{
    my ($self) = @_;

    return $self->{'collection'};
}
161
162
# Locate and parse the collection's configuration file (collect.cfg for
# GS2, collectionConfig.xml for GS3), opening the fail log along the way.
#
# Parameters:
#   $collection - collection name as given on the command line
#   $options    - option spec, used only to print usage on failure
#
# Returns ($config_filename, $collectcfg) where $collectcfg is the parsed
# configuration hash. Dies (after printing usage) if the collection cannot
# be resolved. Side effects: sets $self->{'gs_version'}, opens the FAILLOG
# handle and records it in $self->{'faillog'}/'faillogname', and augments
# @INC with the collection's own perllib directory.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    # use_collection() resolves/validates the collection (and sets
    # $ENV{'GSDLCOLLECTDIR'}); an empty return means it doesn't exist
    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # set gs_version 2/3: a non-empty site name implies a Greenstone 3 setup
    $self->{'gs_version'} = "2";
    if ((defined $site) && ($site ne "")) {
        # gs3
        $self->{'gs_version'} = "3";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib'));

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
        $faillog = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # NOTE(review): if gsprintf() returned false the trailing "&& die"
    # would never fire, so a failed open could be silently ignored here
    open (FAILLOG, ">$faillog") ||
        (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);

    # carry the handle around as the string 'inexport::FAILLOG'; this works
    # because of the "no strict 'refs'" at the top of this file
    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;
    $self->{'close_faillog'} = 1;

    # Read in the collection configuration file.
    my $gs_mode = "gs".$self->{'gs_version'}; #gs2 or gs3
    my $config_filename = &colcfg::get_collect_cfg_name($out, $gs_mode);
    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
211
# Merge run-time options with settings from the collection configuration.
#
# For each option the precedence is: an explicit command-line value wins,
# otherwise the collect.cfg value is used, otherwise a hard-coded default.
# Also resolves and validates the import directory, resolves the
# archives/export output directory, and normalises the
# removeold/keepold/incremental flags via scriptutil.
#
# Parameters:
#   $collectcfg - parsed collection configuration hash (may be modified:
#                 a default 'infodbtype' is filled in here)
#
# Returns nothing useful; all results are written back into $self.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};
    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'} || "";
    my $out = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
        $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
        # we can't use the text version for archives dbs.
        $collectcfg->{'infodbtype'} = "gdbm";
    }

    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    if ($importdir eq "")
    {
        $importdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "import");
    }
    else
    {
        # Don't do this - it kills protocol prefixes
        #$importdir =~ s/[\\\/]+/\//g;
        #$importdir =~ s/\/$//;
        # Do this instead
        # BUGFIX: previously the return value of sanitizePath() was
        # discarded here, so a user-supplied importdir was never actually
        # sanitized; assign it back, matching the $archivedir handling below
        $importdir = &FileUtils::sanitizePath($importdir);
    }
    if (!&FileUtils::directoryExists($importdir))
    {
        &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
        die "\n";
    }
    $self->{'importdir'} = $importdir;

    # pick the output directory by mode when none was given explicitly
    if ($archivedir eq "") {
        if ($inexport_mode eq "import") {
            $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "archives");
        }
        elsif ($inexport_mode eq "export") {
            $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "export");
        }
        else {
            print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
            print STDERR " Defaulting to 'archives' for file output\n";
            $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "archives");
        }
    }

    $archivedir = &FileUtils::sanitizePath($archivedir);
    #$archivedir =~ s/[\\\/]+/\//g;
    #$archivedir =~ s/\/$//;
    $self->{'archivedir'} = $archivedir;

    # verbosity: command line, then collect.cfg, then 2
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    $self->{'verbosity'} = $verbosity;

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
        $self->{'manifest'} = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $self->{'gzip'} = 1;
        }
    }

    # maxdocs: any integer (-1 means unlimited, the default)
    if ($self->{'maxdocs'} !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
        } else {
            $self->{'maxdocs'} = -1; # the default
        }
    }

    # groupsize == 1 is the command-line default, so only in that case may
    # collect.cfg override it
    if ((defined $self->{'groupsize'}) && ($self->{'groupsize'} == 1)) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $self->{'groupsize'} = $collectcfg->{'groupsize'};
        }
    }

    if (!defined $self->{'saveas'}
        || ($self->{'saveas'} !~ /^(GreenstoneXML|GreenstoneMETS)$/ )) {
        # saveas was either not defined on the command-line, or it was not one of the recognized values
        if (defined $collectcfg->{'saveas'}
            && $collectcfg->{'saveas'} =~ /^(GreenstoneXML|GreenstoneMETS)$/) {
            $self->{'saveas'} = $collectcfg->{'saveas'};
        } else {
            $self->{'saveas'} = "GreenstoneXML"; # the default
        }
    }

    if (!defined $self->{'OIDtype'}
        || ($self->{'OIDtype'} !~ /^(hash|hash_on_full_filename|incremental|assigned|filename|dirname|full_filename)$/ )) {
        # OIDtype was either not defined on the command-line, or if it was not one of the recognized values
        if (defined $collectcfg->{'OIDtype'}
            && $collectcfg->{'OIDtype'} =~ /^(hash|hash_on_full_filename|incremental|assigned|filename|dirname|full_filename)$/) {
            $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
        } else {
            $self->{'OIDtype'} = "hash"; # the default
        }
    }

    if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
        if (defined $collectcfg->{'OIDmetadata'}) {
            $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
        } else {
            $self->{'OIDmetadata'} = "dc.Identifier"; # the default
        }
    }

    my $sortmeta = $self->{'sortmeta'};
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $self->{'groupsize'} > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }
    if (defined $sortmeta) {
        &gsprintf($out, "{import.sortmeta_paired_with_ArchivesInfPlugin}\n\n");
    }
    $self->{'sortmeta'} = $sortmeta;

    if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") {
        $self->{'removeprefix'} = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") {
        $self->{'removesuffix'} = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
                                                   $self->{'incremental'}, $checkdir,
                                                   $collectcfg);

    $self->{'removeold'} = $removeold;
    $self->{'keepold'} = $keepold;
    $self->{'incremental'} = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;

    # Since this wasted my morning, let's at least warn a user that manifest
    # files now *only* work if keepold is set [jmt12]
    if ($self->{'manifest'} && !$self->{'keepold'})
    {
        print STDERR "Warning: -manifest flag should not be specified without also setting -keepold or -incremental\n";
    }
}
394
395sub process_files
396{
397 my $self = shift @_;
398 my ($config_filename,$collectcfg) = @_;
399
400 my $inexport_mode = $self->{'mode'};
401
402 my $verbosity = $self->{'verbosity'};
403 my $debug = $self->{'debug'};
404
405 my $importdir = $self->{'importdir'};
406 my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
407
408 my $incremental = $self->{'incremental'};
409 my $incremental_mode = $self->{'incremental_mode'};
410
411 my $gs_version = $self->{'gs_version'};
412
413 my $removeold = $self->{'removeold'};
414 my $keepold = $self->{'keepold'};
415
416 my $saveas = $self->{'saveas'};
417 my $OIDtype = $self->{'OIDtype'};
418 my $OIDmetadata = $self->{'OIDmetadata'};
419
420 my $out = $self->{'out'};
421 my $faillog = $self->{'faillog'};
422
423 my $maxdocs = $self->{'maxdocs'};
424 my $gzip = $self->{'gzip'};
425 my $groupsize = $self->{'groupsize'};
426 my $sortmeta = $self->{'sortmeta'};
427
428 my $removeprefix = $self->{'removeprefix'};
429 my $removesuffix = $self->{'removesuffix'};
430
431 my $gli = $self->{'gli'};
432
433 # related to export
434 my $xsltfile = $self->{'xsltfile'};
435 my $group_marc = $self->{'group_marc'};
436 my $mapping_file = $self->{'mapping_file'};
437 my $xslt_mets = $self->{'xslt_mets'};
438 my $xslt_txt = $self->{'xslt_txt'};
439 my $fedora_namespace = $self->{'fedora_namespace'};
440 my $metadata_prefix = $self->{'metadata_prefix'};
441
442 if ($inexport_mode eq "import") {
443 print STDERR "<Import>\n" if $gli;
444 }
445 else {
446 print STDERR "<export>\n" if $gli;
447 }
448
449 my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
450 if ($self->{'manifest'} ne "") {
451 my $manifest_filename = $self->{'manifest'};
452
453 if (!&FileUtils::isFilenameAbsolute($manifest_filename)) {
454 $manifest_filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
455 }
456
457 $self->{'manifest'} = &FileUtils::sanitizePath($self->{'manifest'});
458 #$self->{'manifest'} =~ s/[\\\/]+/\//g;
459 #$self->{'manifest'} =~ s/\/$//;
460
461 $manifest_lookup->parse($manifest_filename);
462
463 # manifests may now include a version number [jmt12]
464 $self->{'manifest_version'} = $manifest_lookup->get_version();
465 }
466
467 my $manifest = $self->{'manifest'};
468
469 # load all the plugins
470 my $plugins = [];
471 if (defined $collectcfg->{'plugin'}) {
472 $plugins = $collectcfg->{'plugin'};
473 }
474
475 my $plugin_incr_mode = $incremental_mode;
476 if ($manifest ne "") {
477 # if we have a manifest file, then we pretend we are fully incremental for plugins
478 $plugin_incr_mode = "all";
479 }
480 #some global options for the plugins
481 my @global_opts = ();
482
483 my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode, $gs_version);
484 if (scalar(@$pluginfo) == 0) {
485 &gsprintf($out, "{import.no_plugins_loaded}\n");
486 die "\n";
487 }
488
489 # remove the old contents of the archives directory (and tmp
490 # directory) if needed
491
492 if ($removeold) {
493 if (&FileUtils::directoryExists($archivedir)) {
494 &gsprintf($out, "{import.removing_archives}\n");
495 &FileUtils::removeFilesRecursive($archivedir);
496 }
497 my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
498 $tmpdir =~ s/[\\\/]+/\//g;
499 $tmpdir =~ s/\/$//;
500 if (&FileUtils::directoryExists($tmpdir)) {
501 &gsprintf($out, "{import.removing_tmpdir}\n");
502 &FileUtils::removeFilesRecursive($tmpdir);
503 }
504 }
505
506 # create the archives dir if needed
507 &FileUtils::makeAllDirectories($archivedir);
508
509 # read the archive information file
510
511 # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
512 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-doc"));
513 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-src"));
514
515 # When we make these initial calls to determine the archive information doc
516 # and src databases we pass through a '1' to indicate this is the first
517 # time we are referring to these databases. When using dynamic dbutils
518 # (available in extensions) this indicates to some database types (for
519 # example, persistent servers) that this is a good time to perform any
520 # one time initialization. The argument has no effect on vanilla dbutils
521 # [jmt12]
522 my $perform_firsttime_init = 1;
523 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, $perform_firsttime_init);
524 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, $perform_firsttime_init);
525
526 my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
527 $archive_info->load_info ($arcinfo_doc_filename);
528
529 if ($manifest eq "") {
530 # Load in list of files in import folder from last import (if present)
531 $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
532 }
533
534 ####Use Plugout####
535 my $plugout;
536
537 if ($inexport_mode eq "import") {
538 if (defined $collectcfg->{'plugout'}) {
539 # If a plugout was specified in the collect.cfg file, assume it is sensible
540 # We can't check the name because it could be anything, if it is a custom plugout
541 $plugout = $collectcfg->{'plugout'};
542 }
543 else{
544 if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
545 push @$plugout,"GreenstoneXMLPlugout";
546 }
547 else{
548 push @$plugout,$saveas."Plugout";
549 }
550 }
551 }
552 else {
553 if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
554 $plugout = $collectcfg->{'plugout'};
555 }
556 else{
557 if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
558 push @$plugout,"GreenstoneMETSPlugout";
559 }
560 else{
561 push @$plugout,$saveas."Plugout";
562 }
563 }
564 }
565
566 my $plugout_name = $plugout->[0];
567
568 push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
569 push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
570 push @$plugout,("-debug") if ($debug);
571 push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
572 push @$plugout,("-gzip_output") if ($gzip);
573 push @$plugout,("-output_handle",$out) if (defined $out);
574
575 push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");
576
577 if ($plugout_name =~ m/^MARCXMLPlugout$/) {
578 push @$plugout,("-group") if ($group_marc);
579 push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "");
580 }
581 if ($plugout_name =~ m/^.*METSPlugout$/) {
582 push @$plugout,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $xslt_mets ne "");
583 push @$plugout,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $xslt_txt ne "");
584 }
585
586 if ($plugout_name eq "FedoraMETSPlugout") {
587 push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "");
588 }
589
590 if ($plugout_name eq "DSpacePlugout") {
591 push @$plugout,("-metadata_prefix",$metadata_prefix) if (defined $metadata_prefix && $metadata_prefix ne "");
592 }
593
594 my $processor = &plugout::load_plugout($plugout);
595 $processor->setoutputdir ($archivedir);
596 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
597
598 $processor->set_OIDtype ($OIDtype, $OIDmetadata);
599
600 &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
601
602 if ($removeold) {
603 # occasionally, plugins may want to do something on remove
604 # old, eg pharos image indexing
605 &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
606 }
607
608 # process the import directory
609 my $block_hash = {};
610 $block_hash->{'new_files'} = {};
611 $block_hash->{'reindex_files'} = {};
612 # all of these are set somewhere else, so it's more readable to define them
613 # here [jmt12]
614 $block_hash->{'all_files'} = {};
615 $block_hash->{'deleted_files'} = {};
616 $block_hash->{'file_blocks'} = {};
617 $block_hash->{'metadata_files'} = {};
618 $block_hash->{'shared_fileroot'} = '';
619 # a new flag so we can tell we had a manifest way down in the plugins
620 # [jmt12]
621 $block_hash->{'manifest'} = 'false';
622 my $metadata = {};
623
624 # global blocking pass may set up some metadata
625 # - when we have a newer manifest file we don't do this -unless- the
626 # collection configuration indicates this collection contains complex
627 # (inherited) metadata [jmt12]
628 if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true'))
629 {
630 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
631 }
632 else
633 {
634 print STDERR "Skipping global file scan due to manifest and complexmeta configuration\n";
635 }
636
637 if ($manifest ne "") {
638
639 # mark that we are using a manifest - information that might be needed
640 # down in plugins (for instance DirectoryPlugin)
641 $block_hash->{'manifest'} = $self->{'manifest_version'};
642
643 #
644 # 1. Process delete files first
645 #
646 my @deleted_files = keys %{$manifest_lookup->{'delete'}};
647 my @full_deleted_files = ();
648
649 # ensure all filenames are absolute
650 foreach my $df (@deleted_files) {
651 my $full_df =
652 (&FileUtils::isFilenameAbsolute($df))
653 ? $df
654 : &FileUtils::filenameConcatenate($importdir,$df);
655
656 if (-d $full_df) {
657 &add_dir_contents_to_list($full_df, \@full_deleted_files);
658 } else {
659 push(@full_deleted_files,$full_df);
660 }
661 }
662
663 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
664 mark_docs_for_deletion($archive_info,{},
665 \@full_deleted_files,
666 $archivedir, $verbosity, "delete");
667
668
669 #
670 # 2. Now files for reindexing
671 #
672
673 my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
674 my @full_reindex_files = ();
675 # ensure all filenames are absolute
676 foreach my $rf (@reindex_files) {
677 my $full_rf =
678 (&FileUtils::isFilenameAbsolute($rf))
679 ? $rf
680 : &FileUtils::filenameConcatenate($importdir,$rf);
681
682 if (-d $full_rf) {
683 &add_dir_contents_to_list($full_rf, \@full_reindex_files);
684 } else {
685 push(@full_reindex_files,$full_rf);
686 }
687 }
688
689 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
690 mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
691
692 # And now to ensure the new version of the file processed by
693 # appropriate plugin, we need to add it to block_hash reindex list
694 foreach my $full_rf (@full_reindex_files) {
695 $block_hash->{'reindex_files'}->{$full_rf} = 1;
696 }
697
698
699 #
700 # 3. Now finally any new files - add to block_hash new_files list
701 #
702
703 my @new_files = keys %{$manifest_lookup->{'index'}};
704 my @full_new_files = ();
705
706 foreach my $nf (@new_files) {
707 # ensure filename is absolute
708 my $full_nf =
709 (&FileUtils::isFilenameAbsolute($nf))
710 ? $nf
711 : &FileUtils::filenameConcatenate($importdir,$nf);
712
713 if (-d $full_nf) {
714 &add_dir_contents_to_list($full_nf, \@full_new_files);
715 } else {
716 push(@full_new_files,$full_nf);
717 }
718 }
719
720 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
721 # need to check this file exists before trying to read it - in the past
722 # it wasn't possible to have a manifest unless keepold was also set so
723 # you were pretty much guarenteed arcinfo existed
724 # [jmt12]
725 # @todo &FileUtils::fileExists($arcinfo_src_filename) [jmt12]
726 if (-e $arcinfo_src_filename)
727 {
728 my $arcinfodb_map = {};
729 &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
730 foreach my $f (@full_new_files) {
731 # check that we haven't seen it already
732 if (defined $arcinfodb_map->{$f}) {
733 # TODO make better warning
734 print STDERR "Warning: $f already in src archive, \n";
735 } else {
736 $block_hash->{'new_files'}->{$f} = 1;
737 }
738 }
739
740 undef $arcinfodb_map;
741 }
742 # no existing files - so we can just add all the files [jmt12]
743 else
744 {
745 foreach my $f (@full_new_files)
746 {
747 $block_hash->{'new_files'}->{$f} = 1;
748 }
749 }
750
751 # If we are not using complex inherited metadata (and thus have skipped
752 # the global file scan) we need to at least check for a matching
753 # metadata.xml for the files being indexed/reindexed
754 # - unless we are using the newer version of Manifests, which are treated
755 # verbatim, and should have a metadata element for metadata files (so
756 # we can explicitly process metadata files other than metadata.xml)
757 # [jmt12]
758 if ($self->{'manifest_version'} < 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true'))
759 {
760 my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
761 foreach my $file_to_import (@all_files_to_import)
762 {
763 my $metadata_xml_path = $file_to_import;
764 $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/;
765 if (&FileUtils::fileExists($metadata_xml_path))
766 {
767 &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli);
768 }
769 }
770 }
771
772 # new version manifest files explicitly list metadata files to be
773 # processed (ignoring complexmeta if set)
774 # [jmt12]
775 if ($self->{'manifest_version'} > 1)
776 {
777 # Process metadata files
778 foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}})
779 {
780 $self->perform_process_files($manifest, $pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs);
781 }
782 }
783 }
784 else {
785 # if incremental, we read through the import folder to see whats changed.
786
787 if ($incremental || $incremental_mode eq "onlyadd") {
788 prime_doc_oid_count($archivedir);
789
790 # Can now work out which files were new, already existed, and have
791 # been deleted
792
793 new_vs_old_import_diff($archive_info,$block_hash,$importdir,
794 $archivedir,$verbosity,$incremental_mode);
795
796 my @new_files = sort keys %{$block_hash->{'new_files'}};
797 if (scalar(@new_files>0)) {
798 print STDERR "New files and modified metadata files since last import:\n ";
799 print STDERR join("\n ",@new_files), "\n";
800 }
801
802 if ($incremental) {
803 # only look for deletions if we are truely incremental
804 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
805 # Filter out any in gsdl/tmp area
806 my @filtered_deleted_files = ();
807 my $gsdl_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
808 my $collect_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
809 $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
810 $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
811
812 foreach my $df (@deleted_files) {
813 next if ($df =~ m/^$gsdl_tmp_area/);
814 next if ($df =~ m/^$collect_tmp_area/);
815
816 push(@filtered_deleted_files,$df);
817 }
818
819
820 @deleted_files = @filtered_deleted_files;
821
822 if (scalar(@deleted_files)>0) {
823 print STDERR "Files deleted since last import:\n ";
824 print STDERR join("\n ",@deleted_files), "\n";
825
826
827 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
828
829 mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
830 }
831
832 my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
833
834 if (scalar(@reindex_files)>0) {
835 print STDERR "Files to reindex since last import:\n ";
836 print STDERR join("\n ",@reindex_files), "\n";
837 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
838 mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
839 }
840
841 }
842 }
843 }
844
845 # Check for existence of the file that's to contain earliestDateStamp in archivesdir
846 # Do nothing if the file already exists (file exists on incremental build).
847 # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
848 # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
849 # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
850 # oailastmodified and oailastmodifieddate
851 my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp");
852 if (!-f $earliestDatestampFile && -d $archivedir) {
853 my $current_time_in_seconds = time; # in seconds
854
855 if(open(FOUT, ">$earliestDatestampFile")) {
856 # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
857 print FOUT $current_time_in_seconds;
858 close(FOUT);
859 }
860 else {
861 &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
862 }
863
864 }
865
866
867 $self->perform_process_files($manifest, $pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs);
868
869 if ($saveas eq "FedoraMETS") {
870 # create collection "doc obj" for Fedora that contains
871 # collection-level metadata
872
873 my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
874 $doc_obj->set_OID("collection");
875
876 my $col_name = undef;
877 my $col_meta = $collectcfg->{'collectionmeta'};
878
879 if (defined $col_meta) {
880 store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
881 store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
882 }
883 $processor->process($doc_obj);
884 }
885
886 &plugin::end($pluginfo, $processor);
887
888 &plugin::deinit($pluginfo, $processor);
889
890 # Store the value of OIDCount (used in doc.pm) so it can be
891 # restored correctly to this value on an incremental build
892 # - this OIDcount file should only be generated for numerical oids [jmt12]
893 if ($self->{'OIDtype'} eq 'incremental')
894 {
895 store_doc_oid_count($archivedir);
896 }
897
898 # write out the archive information file
899 $processor->close_file_output() if (defined $groupsize) && ($groupsize > 1);
900 $processor->close_group_output() if $processor->is_group();
901
    # for backwards compatibility with archives.inf file
903 if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
904 $archive_info->save_info($arcinfo_doc_filename);
905 }
906 else {
907 $archive_info->save_revinfo_db($arcinfo_src_filename);
908 }
909
910 return $pluginfo;
911}
912
913# @function perform_process_files()
914# while process_files() above prepares the system to import files this is the
915# function that actually initiates the plugin pipeline to process the files.
# This function should therefore be overridden in subclasses of inexport.pm if
917# they wish to do different or further processing
918# @author jmt12
sub perform_process_files
{
    my $self = shift(@_);
    my ($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs) = @_;

    my $gli = $self->{'gli'};

    # Manifest version 2+ hands us one specific file to process: read just
    # that file and stop.
    if ($file_to_import ne '')
    {
	&plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
	return;
    }

    # No single file was nominated.  If there is no manifest at all, or only
    # an old-style (pre-version-1) manifest, fall back to a global scan of the
    # import directory.  Old manifests steer the import via extra settings in
    # $block_hash; plain imports use a regular $block_hash (so obeying
    # process_exp and block_exp). [jmt12]
    if ($manifest eq '' || $self->{'manifest_version'} < 1)
    {
	&plugin::read ($pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
	return;
    }

    # A new-style manifest is present, so the files were already read above.
    print STDERR "Skipping perform_process_files() due to manifest presence and version\n";
}
942# perform_process_files()
943
944# @function generate_statistics()
# Print the end-of-run banner ("import complete" / "export complete") and ask
# the plugin pipeline to emit its per-plugin statistics.
sub generate_statistics
{
    my $self = shift(@_);
    my ($pluginfo) = @_;

    my $inexport_mode = $self->{'mode'};
    my $out = $self->{'out'};

    # Banner line is emitted twice, above and below the completion message
    my $banner = "*********************************************\n";

    &gsprintf($out, "\n");
    &gsprintf($out, $banner);
    # resource key is e.g. "{import.complete}" or "{export.complete}"
    &gsprintf($out, "{$inexport_mode.complete}\n");
    &gsprintf($out, $banner);

    &plugin::write_stats($pluginfo, 'STDERR', $self->{'faillogname'}, $self->{'gli'});
}
962# generate_statistics()
963
964
965# @function deinit()
966# Close down any file handles that we opened (and hence are responsible for
# closing)
sub deinit
{
    my $self = shift(@_);

    # Only close the handles that this object opened itself (flags are set
    # wherever OUT / FAILLOG were redirected to files).
    if ($self->{'close_out'}) {
	close OUT;
    }
    if ($self->{'close_faillog'}) {
	close FAILLOG;
    }
}
974# deinit()
975
976
# Copy one collection-level metadata field (e.g. "collectionname",
# "collectionextra") from the collect.cfg structure onto the top section of
# the given document object, preserving any per-language variants.
sub store_collectionmeta
{
    my ($collectionmeta, $field, $doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();

    # hash of language-key => value for this field
    my $values_by_key = $collectionmeta->{$field};

    while (my ($meta_key, $meta_val) = each %$values_by_key)
    {
	my $md_label = "ex.$field";

	# A key of the form "[l=xx]" carries a language qualifier, which is
	# appended to the label as "^xx"
	$md_label .= "^$1" if ($meta_key =~ m/^\[l=(.*?)\]$/);

	$doc_obj->add_utf8_metadata($top_section, $md_label, $meta_val);

	# see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
	# while "collectionname" in GS2 is called "name" in GS3.
	# Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
	if ($md_label eq "ex.collectionname^en" || $md_label eq "ex.collectionname")
	{
	    $doc_obj->add_utf8_metadata($top_section, "dc.Title", $meta_val);
	}
    }
}
1014
1015
# Return the full path of the "OIDcount" state file kept inside the given
# archives directory.
sub oid_count_file {
    my ($archive_dir) = @_;
    return &FileUtils::filenameConcatenate($archive_dir, "OIDcount");
}
1020
1021
# Restore doc.pm's OID counter from the "OIDcount" file in the archives
# directory (written by store_doc_oid_count at the end of a previous import),
# so that an incremental build continues numbering from where it left off.
# Silently does nothing if the file does not exist; warns if it cannot be read.
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    if (-e $oid_count_filename) {
	# three-arg open with a lexical filehandle (safer than the old
	# two-arg bareword form, which could misparse the filename as a mode)
	if (open(my $oid_in, '<', $oid_count_filename)) {
	    my $OIDcount = <$oid_in>;
	    close($oid_in);

	    # guard against an empty file leaving the counter undefined
	    if (defined $OIDcount) {
		chomp $OIDcount;
		$doc::OIDcount = $OIDcount;
	    }
	}
	else {
	    &gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
	}
    }

}
1041
sub store_doc_oid_count
{
    # Use the file "OIDcount" in the archives directory to record
    # what value doc.pm got up to, so prime_doc_oid_count() can restore
    # it on the next incremental build.  Warns (does not die) on failure.

    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # three-arg open with a lexical filehandle, per
    # @todo $oidout = &FileUtils::openFileDescriptor($oid_count_filename, 'w') [jmt12]
    if (open(my $oid_out, '>', $oid_count_filename)) {
	print {$oid_out} $doc::OIDcount, "\n";

	close($oid_out);
    }
    else {
	&gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
1060
1061
1062
# Compare the current import-folder contents against the file list recorded
# from the previous import (held in the arcinfo object) and partition
# $block_hash->{'all_files'} into:
#   'new_files'                      - not seen in the previous import
#   'existing_files'                 - seen before and (initially) unchanged
#   'reindex_files'                  - seen before but needing re-processing
#   'new_or_modified_metadata_files' - metadata.xml-style files that are new
#                                      or newer than the archive info db
#   'deleted_files'                  - in the previous import but now gone
# Modifies $block_hash in place; 'all_files' entries are consumed as they are
# classified.
#
# @param $archive_info      arcinfo object for this collection
# @param $block_hash        hash of file lists, updated in place
# @param $importdir         import directory (used to absolutise rel. paths)
# @param $archivedir        archives directory (location of info databases)
# @param $verbosity         numeric; >=2 enables diagnostic output
# @param $incremental_mode  "all" or "onlyadd" (other values treated as onlyadd)
sub new_vs_old_import_diff
{
    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    # -M gives age in days since last modification; files "younger" than the
    # archive info database are considered modified since the last build
    my $archiveinf_timestamp = -M $arcinfo_doc_filename;

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default

    my $prev_all_files = $archive_info->{'prev_import_filelist'};
    my $full_prev_all_files = {};

    foreach my $prev_file (keys %$prev_all_files) {

	if (!&FileUtils::isFilenameAbsolute($prev_file)) {
	    # NOTE(review): previous-import entries are resolved against
	    # GSDLCOLLECTDIR, whereas current entries below are resolved
	    # against $importdir -- presumably both yield the same absolute
	    # path for files under import/; confirm if importdir can lie
	    # outside the collection directory.
	    my $full_prev_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$prev_file);
	    $full_prev_all_files->{$full_prev_file} = $prev_file;
	}
	else {
	    $full_prev_all_files->{$prev_file} = $prev_file;
	}
    }


    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

	my $full_curr_file = $curr_file;

	# entry in 'all_files' is moved to either 'existing_files',
	# 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

	if (!&FileUtils::isFilenameAbsolute($curr_file)) {
	    # add in import dir to make absolute
	    $full_curr_file = &FileUtils::filenameConcatenate($importdir,$curr_file);
	}

	# figure out if new file or not
	if (defined $full_prev_all_files->{$full_curr_file}) {
	    # delete it so that only files that need deleting are left
	    delete $full_prev_all_files->{$full_curr_file};

	    # had it before. is it a metadata file?
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {

		# is it modified??
		if (-M $full_curr_file < $archiveinf_timestamp) {
		    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
		    # its newer than last build
		    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
		}
	    }
	    else {
		if ($incremental_mode eq "all") {

		    # had it before
		    $block_hash->{'existing_files'}->{$full_curr_file} = 1;

		}
		else {
		    # Warning in "onlyadd" mode, but had it before!
		    print STDERR "Warning: File $full_curr_file previously imported.\n";
		    print STDERR "         Treating as new file\n";

		    $block_hash->{'new_files'}->{$full_curr_file} = 1;

		}
	    }
	}
	else {
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# the new file is the special sort of file greenstone uses
		# to attach metadata to src documents
		# i.e metadata.xml
		# (but note, the filename used is not constrained in
		# Greenstone to always be this)

		print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
		$block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
	    }
	    else {
		$block_hash->{'new_files'}->{$full_curr_file} = 1;
	    }
	}

	# classified: remove from the work list
	delete $block_hash->{'all_files'}->{$curr_file};
    }




    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
	my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

	$situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
	$situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression

	# Go through existing_files, and mark anything that is contained
	# within 'situated_dir' to be reindexed (in case some of the metadata
	# attaches to one of these files)

	my $reindex_files = [];

	foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

	    # prefix match: file lives somewhere under the metadata file's dir
	    if ($existing_f =~ m/^$situated_dir/) {

		print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

		push(@$reindex_files,$existing_f);
		$block_hash->{'reindex_files'}->{$existing_f} = 1;
		delete $block_hash->{'existing_files'}->{$existing_f};

	    }
	}

	# metadata file needs to be in new_files list so parsed by MetadataXMLPlug
	# (or equivalent)
	$block_hash->{'new_files'}->{$new_mdf} = 1;

    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
	if (-M $existing_filename < $archiveinf_timestamp) {
	    # file is newer than last build

	    my $existing_file = $existing_filename;
	    #my $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'});

	    #my $collectdir_resafe = &util::filename_to_regex($collectdir);
	    #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;

	    print STDERR "**** Reindexing existing file: $existing_file\n";

	    push(@$reindex_files,$existing_file);
	    $block_hash->{'reindex_files'}->{$existing_filename} = 1;
	}

    }


    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # supress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    # NOTE(review): map in void context used purely for side effects here;
    # behaviour is correct but a foreach would express the intent better.
    my @deleted_files = values %$full_prev_all_files;
    map { my $curr_file = $_;
	  my $full_curr_file = $curr_file;

	  if (!&FileUtils::isFilenameAbsolute($curr_file)) {
	      # add in import dir to make absolute

	      $full_curr_file = &FileUtils::filenameConcatenate($collectdir,$curr_file);
	  }


	  if (!-e $full_curr_file) {
	      $block_hash->{'deleted_files'}->{$curr_file} = 1;
	  }
	} @deleted_files;



}
1260
1261
1262# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
1263# $mode is 'delete' or 'reindex'
# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
# $mode is 'delete' or 'reindex'
#
# For every file in @$deleted_files this:
#   1. looks up (via archiveinf-src) every OID the file contributed to,
#   2. removes the file's own src record from the database,
#   3. marks each affected OID's index-status as "D" in archiveinf-doc (so
#      buildcol.pl deletes or reimports it), and
#   4. if the file was merely an associated/metadata file, flags the primary
#      source document in $block_hash->{'reindex_files'} for reimport.
# In 'delete' mode, files that were themselves deleted are finally removed
# again from 'reindex_files' so they are not accidentally reimported.
#
# @param $archive_info   arcinfo object (holds per-OID status info)
# @param $block_hash     hash of file lists, updated in place
# @param $deleted_files  array ref of filenames to process
# @param $archivedir     archives directory (location of info databases)
# @param $verbosity      numeric; >1 prints a line per marked OID
# @param $mode           'delete' or 'reindex' (only affects messages and step 4)
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
	$mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
	# use 'archiveinf-src' info database file to look up all the OIDs
	# that this file is used in (note in most cases, it's just one OID)

	my $src_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
	my $oids = $src_rec->{'oid'};
	# NOTE(review): $file_record_deleted is never used below -- candidate
	# for removal
	my $file_record_deleted = 0;

	# delete the src record
	my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
	&dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
	&dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);


	foreach my $oid (@$oids) {

	    # find the source doc (the primary file that becomes this oid)
	    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
	    my $doc_source_file = $doc_rec->{'src-file'}->[0];
	    if (!&util::filename_is_absolute($doc_source_file)) {
		$doc_source_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
	    }

	    if ($doc_source_file ne $file) {
		# its an associated or metadata file

		# mark source doc for reimport as one of its assoc files has changed or deleted
		$block_hash->{'reindex_files'}->{$doc_source_file} = 1;

	    }
	    # only mark the OID once: skip if already flagged "D"
	    my $curr_status = $archive_info->get_status_info($oid);
	    if (defined($curr_status) && (($curr_status ne "D"))) {
		if ($verbosity>1) {
		    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
		}
		# mark oid for deletion (it will be deleted or reimported)
		$archive_info->set_status_info($oid,"D");
		# rewrite the raw record with its <index-status> line set to D
		my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
		$val =~ s/^<index-status>(.*)$/<index-status>D/m;

		my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
		my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

		&dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
		&dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
	    }
	}

    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.

    if ($mode eq "delete") {
	foreach my $file (@$deleted_files) {
	    if (defined $block_hash->{'reindex_files'}->{$file}) {
		delete $block_hash->{'reindex_files'}->{$file};
	    }
	}
    }


}
1346
# Recursively collect all regular files under $dirname, appending their full
# paths to the array ref $list.  Entries "." , ".." and ".svn" directories are
# skipped.  Returns -1 if the directory cannot be read (after printing a
# warning), 0 otherwise.
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # lexical dir handle rather than the old bareword DIR, so recursive
    # calls cannot clobber each other's handle
    my $dir_handle;
    if (!opendir($dir_handle, $dirname)) {
	print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    my @entries = readdir($dir_handle);
    closedir($dir_handle);

    foreach my $entry (@entries) {
	next if ($entry =~ m/^\.\.?$/);    # skip "." and ".."
	next if ($entry =~ /^\.svn$/);     # skip subversion bookkeeping dirs
	my $full_file = &FileUtils::filenameConcatenate($dirname, $entry);
	if (-d $full_file) {
	    # recurse into subdirectory
	    &add_dir_contents_to_list($full_file, $list);
	} else {
	    push (@$list, $full_file);
	}
    }

    return 0;
}
1375
1376
13771;
Note: See TracBrowser for help on using the repository browser.