source: main/trunk/greenstone2/perllib/inexport.pm@ 27302

Last change on this file since 27302 was 27302, checked in by jmt12, 11 years ago

Removed parallel processing stuff as that now lives in an extension. Restructured to better support overriding by extensions. Checks for manifest version, and processes files accordingly. Conditional addition to INC and PATH environment variables (explained elsewhere). Replace deprecated util.pm calls with FileUtils.pm ones

  • Property svn:executable set to *
File size: 44.2 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use plugin;
38use plugout;
39use manifest;
40use inexport;
41use util;
42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
47
48use File::Basename;
49
# @function new
# Constructor shared by import.pl and export.pl.  Parses the command line
# into $self, loads the requested language resource bundle, handles the
# -listall and -xml usage modes (which short-circuit), opens the output
# stream, and records the collection name (the single leftover argument).
# @param $mode either 'import' or 'export'
# @param $argv reference to the raw command-line argument list
# @param $options parse2-style option specification for this script
# @param $opt_listall_options option specification printed by -listall
# @return blessed inexport object
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
	&gsprintf::load_language_specific_resource_bundle($language);
    }

    # -listall: dump every available option (XML or text form) and stop
    if ($self->{'listall'}) {
	if ($self->{'xml'}) {
	    &PrintUsage::print_xml_usage($opt_listall_options);
	}
	else
	{
	    &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
	}
	die "\n";
    }


    # -xml: print usage in XML form; note this branch still returns a
    # blessed object (unlike -listall above, which dies)
    if ($self->{'xml'}) {
	&PrintUsage::print_xml_usage($options);
	print "\n";
	return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
	&gsprintf::output_strings_in_UTF8;
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also

    if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
    {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }

    # Redirect output unless STDERR/STDOUT was requested.  'OUT' is a
    # package filehandle; $out is then set to its fully-qualified name,
    # usable as a handle thanks to the file-level 'no strict refs'.
    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
	open (OUT, ">$out") ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
	$out = 'inexport::OUT';
	$self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # Unless otherwise stated all manifests are considered version 1---where
    # they act more like an advanced process expression---as compared to newer
    # manifest files that act as an explicit (and exhaustive) list of files to
    # process [jmt12]
    $self->{'manifest_version'} = 1;

    return bless $self, $class;
}
129
# @function newCGI
# Simplified version of the constructor for use with CGI scripts: no
# command-line parsing, output fixed to STDERR, and the collect directory
# taken from the CGI helper (GS3) or from $GSDLHOME/collect (GS2).
# @param $mode either 'import' or 'export'
# @param $collect the collection name
# @param $gsdl_cgi optional CGI helper object supplying the collect dir
# @param $opt_site optional site name (GS3 only)
# @return blessed inexport object
sub newCGI
{
    my ($class, $mode, $collect, $gsdl_cgi, $opt_site) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    $self->{'out'} = STDERR;

    if (defined $gsdl_cgi)
    {
	# running under a CGI wrapper: it knows where the site's
	# collect directory lives
	$self->{'site'} = $opt_site;
	$self->{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }
    else
    {
	# stand-alone GS2 defaults
	$self->{'site'} = "";
	$self->{'collectdir'} = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
    }

    $self->{'faillog'} = "";
    $self->{'collection'} = $collect;

    return bless $self, $class;
}
# @function get_collection
# Accessor for the collection name this object was constructed with.
# @return the collection name string
sub get_collection
{
    my ($self) = @_;

    return $self->{'collection'};
}
161
162
# @function read_collection_cfg
# Locates the named collection, works out whether this is a GS2 or GS3
# run (based on whether a site was supplied), adds the collection's own
# perllib directory to @INC, opens the fail log, and finally reads the
# collection configuration file.
# @param $collection the collection name from the command line
# @param $options option specification (used to print usage on failure)
# @return ($config_filename, $collectcfg): the configuration file path
#         and its parsed contents
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    # use_collection() validates the collection and sets GSDLCOLLECTDIR;
    # an empty return value means the collection could not be found
    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }

    # set gs_version 2/3
    $self->{'gs_version'} = "2";
    if ((defined $site) && ($site ne "")) {
	# gs3
	$self->{'gs_version'} = "3";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib'));

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
	$faillog = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    open (FAILLOG, ">$faillog") ||
	(&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);


    # keep the original path for reporting, then switch $faillog over to
    # the fully-qualified package handle name; the method call on the
    # string works because of the file-level 'no strict refs'
    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;
    $self->{'close_faillog'} = 1;

    # Read in the collection configuration file.
    my $gs_mode = "gs".$self->{'gs_version'}; #gs2 or gs3
    my $config_filename = &colcfg::get_collect_cfg_name($out, $gs_mode);
    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
211
# @function set_collection_options
# Merges collection configuration values into $self.  The precedence
# throughout is: explicit command-line value first, then the collect.cfg
# value, then a hard-coded default.  Also normalises the import/archives
# directory paths and reconciles the removeold/keepold/incremental flags.
# @param $collectcfg parsed collection configuration hash
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};
    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'} || "";
    my $out = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
	$collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
	# we can't use the text version for archives dbs.
	$collectcfg->{'infodbtype'} = "gdbm";
    }

    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
	$importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
	$archivedir = $collectcfg->{'archivedir'};
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    # @todo &FileUtils::sanitizePath($importdir) [jmt12]
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    # the import directory must already exist; the archives dir (below)
    # is created later if missing
    if (!-e $importdir) {
	&gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
	die "\n";
    }
    $self->{'importdir'} = $importdir;

    # default output directory depends on whether we are importing
    # ("archives") or exporting ("export")
    if ($archivedir eq "") {
	if ($inexport_mode eq "import") {
	    $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "archives");
	}
	elsif ($inexport_mode eq "export") {
	    $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "export");
	}
	else {
	    print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
	    print STDERR " Defaulting to 'archives' for file output\n";
	    $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "archives");
	}
    }

    # @todo &FileUtils::sanitizePath($archivedir) [jmt12]
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $self->{'archivedir'} = $archivedir;

    # verbosity: command line beats collect.cfg beats the default of 2
    if ($verbosity !~ /\d+/) {
	if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
	    $verbosity = $collectcfg->{'verbosity'};
	} else {
	    $verbosity = 2; # the default
	}
    }
    $self->{'verbosity'} = $verbosity;

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
	$self->{'manifest'} = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
	if ($collectcfg->{'gzip'} =~ /^true$/i) {
	    $self->{'gzip'} = 1;
	}
    }

    # maxdocs may legitimately be negative (-1 means "no limit")
    if ($self->{'maxdocs'} !~ /\-?\d+/) {
	if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
	    $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
	} else {
	    $self->{'maxdocs'} = -1; # the default
	}
    }

    # only fall back to collect.cfg when groupsize is still at its
    # command-line default of 1
    if ((defined $self->{'groupsize'}) && ($self->{'groupsize'} == 1)) {
	if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
	    $self->{'groupsize'} = $collectcfg->{'groupsize'};
	}
    }

    if (!defined $self->{'OIDtype'}
	|| ($self->{'OIDtype'} !~ /^(hash|hash_on_full_filename|incremental|assigned|dirname|full_filename)$/ )) {
	# OIDtype was either not defined on the command-line, or if it was not one of the recognized values
	if (defined $collectcfg->{'OIDtype'}
	    && $collectcfg->{'OIDtype'} =~ /^(hash|hash_on_full_filename|incremental|assigned|dirname|full_filename)$/) {
	    $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
	} else {
	    $self->{'OIDtype'} = "hash"; # the default
	}
    }

    if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
	if (defined $collectcfg->{'OIDmetadata'}) {
	    $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
	} else {
	    $self->{'OIDmetadata'} = "dc.Identifier"; # the default
	}
    }

    my $sortmeta = $self->{'sortmeta'};
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
	$sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $self->{'groupsize'} > 1) {
	&gsprintf($out, "{import.cannot_sort}\n\n");
	$sortmeta = undef;
    }
    $self->{'sortmeta'} = $sortmeta;

    if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") {
	$self->{'removeprefix'} = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") {
	$self->{'removesuffix'} = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
	$self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
	$self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
	= &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
						   $self->{'incremental'}, $checkdir,
						   $collectcfg);

    $self->{'removeold'} = $removeold;
    $self->{'keepold'} = $keepold;
    $self->{'incremental'} = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;

    # Since this wasted my morning, let's at least warn a user that manifest
    # files now *only* work if keepold is set [jmt12]
    if ($self->{'manifest'} && !$self->{'keepold'})
    {
	print STDERR "Warning: -manifest flag should not be specified without also setting -keepold or -incremental\n";
    }
}
371
# @function process_files
# The main driver of an import/export run: parses any manifest file,
# loads plugins and the plugout, optionally clears the archives and tmp
# directories, works out which files are new/changed/deleted (either from
# the manifest or by diffing against the previous import), then pushes
# the files through the plugin pipeline via perform_process_files().
# @param $config_filename path of the collection configuration file
# @param $collectcfg parsed collection configuration hash
# @return the loaded plugin information structure ($pluginfo)
sub process_files
{
    my $self = shift @_;
    my ($config_filename,$collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};

    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};

    my $incremental = $self->{'incremental'};
    my $incremental_mode = $self->{'incremental_mode'};

    my $gs_version = $self->{'gs_version'};

    my $removeold = $self->{'removeold'};
    my $keepold = $self->{'keepold'};

    my $saveas = $self->{'saveas'};
    my $OIDtype = $self->{'OIDtype'};
    my $OIDmetadata = $self->{'OIDmetadata'};

    my $out = $self->{'out'};
    my $faillog = $self->{'faillog'};

    my $maxdocs = $self->{'maxdocs'};
    my $gzip = $self->{'gzip'};
    my $groupsize = $self->{'groupsize'};
    my $sortmeta = $self->{'sortmeta'};

    my $removeprefix = $self->{'removeprefix'};
    my $removesuffix = $self->{'removesuffix'};

    my $gli = $self->{'gli'};

    # related to export
    my $xsltfile = $self->{'xsltfile'};
    my $group_marc = $self->{'group_marc'};
    my $mapping_file = $self->{'mapping_file'};
    my $xslt_mets = $self->{'xslt_mets'};
    my $xslt_txt = $self->{'xslt_txt'};
    my $fedora_namespace = $self->{'fedora_namespace'};
    my $metadata_prefix = $self->{'metadata_prefix'};

    if ($inexport_mode eq "import") {
	print STDERR "<Import>\n" if $gli;
    }
    else {
	print STDERR "<export>\n" if $gli;
    }

    # parse the manifest (if any) so we know what to delete/reindex/index
    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
    if ($self->{'manifest'} ne "") {
	my $manifest_filename = $self->{'manifest'};

	if (!&FileUtils::isFilenameAbsolute($manifest_filename)) {
	    $manifest_filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
	}

	$self->{'manifest'} =~ s/[\\\/]+/\//g;
	$self->{'manifest'} =~ s/\/$//;

	$manifest_lookup->parse($manifest_filename);

	# manifests may now include a version number [jmt12]
	$self->{'manifest_version'} = $manifest_lookup->get_version();
    }

    my $manifest = $self->{'manifest'};

    # load all the plugins
    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
	$plugins = $collectcfg->{'plugin'};
    }

    my $plugin_incr_mode = $incremental_mode;
    if ($manifest ne "") {
	# if we have a manifest file, then we pretend we are fully incremental for plugins
	$plugin_incr_mode = "all";
    }
    #some global options for the plugins
    my @global_opts = ();

    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode, $gs_version);
    if (scalar(@$pluginfo) == 0) {
	&gsprintf($out, "{import.no_plugins_loaded}\n");
	die "\n";
    }

    # remove the old contents of the archives directory (and tmp
    # directory) if needed

    if ($removeold) {
	if (&FileUtils::directoryExists($archivedir)) {
	    &gsprintf($out, "{import.removing_archives}\n");
	    &FileUtils::removeFilesRecursive($archivedir);
	}
	my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
	$tmpdir =~ s/[\\\/]+/\//g;
	$tmpdir =~ s/\/$//;
	if (&FileUtils::directoryExists($tmpdir)) {
	    &gsprintf($out, "{import.removing_tmpdir}\n");
	    # Bugfix: this previously called removeFileRecursive() (no 's'),
	    # which doesn't match the FileUtils call used for the archives
	    # directory above and would fail as an undefined subroutine
	    &FileUtils::removeFilesRecursive($tmpdir);
	}
    }

    # create the archives dir if needed
    &FileUtils::makeAllDirectories($archivedir);

    # read the archive information file

    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
    &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-doc"));
    &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-src"));

    # When we make these initial calls to determine the archive information doc
    # and src databases we pass through a '1' to indicate this is the first
    # time we are referring to these databases. When using dynamic dbutils
    # (available in extensions) this indicates to some database types (for
    # example, persistent servers) that this is a good time to perform any
    # one time initialization. The argument has no effect on vanilla dbutils
    # [jmt12]
    my $perform_firsttime_init = 1;
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, $perform_firsttime_init);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, $perform_firsttime_init);

    my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
    $archive_info->load_info ($arcinfo_doc_filename);

    if ($manifest eq "") {
	# Load in list of files in import folder from last import (if present)
	$archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    }

    ####Use Plugout####
    # choose the plugout: collect.cfg takes precedence, otherwise derive
    # it from -saveas (with a mode-appropriate default)
    my $plugout;

    if ($inexport_mode eq "import") {
	if (defined $collectcfg->{'plugout'}) {
	    # If a plugout was specified in the collect.cfg file, assume it is sensible
	    # We can't check the name because it could be anything, if it is a custom plugout
	    $plugout = $collectcfg->{'plugout'};
	}
	else{
	    if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
		push @$plugout,"GreenstoneXMLPlugout";
	    }
	    else{
		push @$plugout,$saveas."Plugout";
	    }
	}
    }
    else {
	if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
	    $plugout = $collectcfg->{'plugout'};
	}
	else{
	    if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
		push @$plugout,"GreenstoneMETSPlugout";
	    }
	    else{
		push @$plugout,$saveas."Plugout";
	    }
	}
    }

    my $plugout_name = $plugout->[0];

    # pass common options through to the plugout as command-line style args
    push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
    push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
    push @$plugout,("-debug") if ($debug);
    push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
    push @$plugout,("-gzip_output") if ($gzip);
    push @$plugout,("-output_handle",$out) if (defined $out);

    push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");

    # plugout-specific options
    if ($plugout_name =~ m/^MARCXMLPlugout$/) {
	push @$plugout,("-group") if ($group_marc);
	push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "");
    }
    if ($plugout_name =~ m/^.*METSPlugout$/) {
	push @$plugout,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $xslt_mets ne "");
	push @$plugout,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $xslt_txt ne "");
    }

    if ($plugout_name eq "FedoraMETSPlugout") {
	push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "");
    }

    if ($plugout_name eq "DSpacePlugout") {
	push @$plugout,("-metadata_prefix",$metadata_prefix) if (defined $metadata_prefix && $metadata_prefix ne "");
    }

    my $processor = &plugout::load_plugout($plugout);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;

    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($removeold) {
	# occasionally, plugins may want to do something on remove
	# old, eg pharos image indexing
	&plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
    }

    # process the import directory
    my $block_hash = {};
    $block_hash->{'new_files'} = {};
    $block_hash->{'reindex_files'} = {};
    # all of these are set somewhere else, so it's more readable to define them
    # here [jmt12]
    $block_hash->{'all_files'} = {};
    $block_hash->{'deleted_files'} = {};
    $block_hash->{'file_blocks'} = {};
    $block_hash->{'metadata_files'} = {};
    $block_hash->{'shared_fileroot'} = '';
    # a new flag so we can tell we had a manifest way down in the plugins
    # [jmt12]
    $block_hash->{'manifest'} = 'false';
    my $metadata = {};

    # global blocking pass may set up some metadata
    # - when we have a newer manifest file we don't do this -unless- the
    #   collection configuration indicates this collection contains complex
    #   (inherited) metadata [jmt12]
    if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true'))
    {
	&plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
    }
    else
    {
	print STDERR "Skipping global file scan due to manifest and complexmeta configuration\n";
    }

    if ($manifest ne "") {

	# mark that we are using a manifest - information that might be needed
	# down in plugins (for instance DirectoryPlugin)
	$block_hash->{'manifest'} = $self->{'manifest_version'};

	#
	# 1. Process delete files first
	#
	my @deleted_files = keys %{$manifest_lookup->{'delete'}};
	my @full_deleted_files = ();

	# ensure all filenames are absolute
	foreach my $df (@deleted_files) {
	    my $full_df =
		(&FileUtils::isFilenameAbsolute($df))
		? $df
		: &FileUtils::filenameConcatenate($importdir,$df);

	    # a directory in the manifest means "everything inside it"
	    if (-d $full_df) {
		&add_dir_contents_to_list($full_df, \@full_deleted_files);
	    } else {
		push(@full_deleted_files,$full_df);
	    }
	}

	&plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
	mark_docs_for_deletion($archive_info,{},
			       \@full_deleted_files,
			       $archivedir, $verbosity, "delete");


	#
	# 2. Now files for reindexing
	#

	my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
	my @full_reindex_files = ();
	# ensure all filenames are absolute
	foreach my $rf (@reindex_files) {
	    my $full_rf =
		(&FileUtils::isFilenameAbsolute($rf))
		? $rf
		: &FileUtils::filenameConcatenate($importdir,$rf);

	    if (-d $full_rf) {
		&add_dir_contents_to_list($full_rf, \@full_reindex_files);
	    } else {
		push(@full_reindex_files,$full_rf);
	    }
	}

	&plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
	mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");

	# And now to ensure the new version of the file processed by
	# appropriate plugin, we need to add it to block_hash reindex list
	foreach my $full_rf (@full_reindex_files) {
	    $block_hash->{'reindex_files'}->{$full_rf} = 1;
	}


	#
	# 3. Now finally any new files - add to block_hash new_files list
	#

	my @new_files = keys %{$manifest_lookup->{'index'}};
	my @full_new_files = ();

	foreach my $nf (@new_files) {
	    # ensure filename is absolute
	    my $full_nf =
		(&FileUtils::isFilenameAbsolute($nf))
		? $nf
		: &FileUtils::filenameConcatenate($importdir,$nf);

	    if (-d $full_nf) {
		&add_dir_contents_to_list($full_nf, \@full_new_files);
	    } else {
		push(@full_new_files,$full_nf);
	    }
	}

	my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
	# need to check this file exists before trying to read it - in the past
	# it wasn't possible to have a manifest unless keepold was also set so
	# you were pretty much guarenteed arcinfo existed
	# [jmt12]
	# @todo &FileUtils::fileExists($arcinfo_src_filename) [jmt12]
	if (-e $arcinfo_src_filename)
	{
	    my $arcinfodb_map = {};
	    &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
	    foreach my $f (@full_new_files) {
		# check that we haven't seen it already
		if (defined $arcinfodb_map->{$f}) {
		    # TODO make better warning
		    print STDERR "Warning: $f already in src archive, \n";
		} else {
		    $block_hash->{'new_files'}->{$f} = 1;
		}
	    }

	    undef $arcinfodb_map;
	}
	# no existing files - so we can just add all the files [jmt12]
	else
	{
	    foreach my $f (@full_new_files)
	    {
		$block_hash->{'new_files'}->{$f} = 1;
	    }
	}

	# If we are not using complex inherited metadata (and thus have skipped
	# the global file scan) we need to at least check for a matching
	# metadata.xml for the files being indexed/reindexed
	# - unless we are using the newer version of Manifests, which are treated
	#   verbatim, and should have a metadata element for metadata files (so
	#   we can explicitly process metadata files other than metadata.xml)
	# [jmt12]
	# NOTE(review): with the default manifest_version of 1 the '< 1' test
	# below can never be true, so version-1 manifests never get this
	# metadata.xml scan -- confirm whether '< 2' was intended before changing
	if ($self->{'manifest_version'} < 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true'))
	{
	    my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
	    foreach my $file_to_import (@all_files_to_import)
	    {
		my $metadata_xml_path = $file_to_import;
		$metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/;
		if (&FileUtils::fileExists($metadata_xml_path))
		{
		    &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli);
		}
	    }
	}

	# new version manifest files explicitly list metadata files to be
	# processed (ignoring complexmeta if set)
	# [jmt12]
	if ($self->{'manifest_version'} > 1)
	{
	    # Process metadata files
	    foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}})
	    {
		$self->perform_process_files($manifest, $pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs);
	    }
	}
    }
    else {
	# if incremental, we read through the import folder to see whats changed.

	if ($incremental || $incremental_mode eq "onlyadd") {
	    prime_doc_oid_count($archivedir);

	    # Can now work out which files were new, already existed, and have
	    # been deleted

	    new_vs_old_import_diff($archive_info,$block_hash,$importdir,
				   $archivedir,$verbosity,$incremental_mode);

	    my @new_files = sort keys %{$block_hash->{'new_files'}};
	    # Bugfix: parenthesis was misplaced -- 'scalar(@new_files>0)'
	    if (scalar(@new_files) > 0) {
		print STDERR "New files and modified metadata files since last import:\n ";
		print STDERR join("\n ",@new_files), "\n";
	    }

	    if ($incremental) {
		# only look for deletions if we are truely incremental
		my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
		# Filter out any in gsdl/tmp area
		my @filtered_deleted_files = ();
		my $gsdl_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
		my $collect_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
		$gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
		$collect_tmp_area = &util::filename_to_regex($collect_tmp_area);

		foreach my $df (@deleted_files) {
		    next if ($df =~ m/^$gsdl_tmp_area/);
		    next if ($df =~ m/^$collect_tmp_area/);

		    push(@filtered_deleted_files,$df);
		}


		@deleted_files = @filtered_deleted_files;

		if (scalar(@deleted_files)>0) {
		    print STDERR "Files deleted since last import:\n ";
		    print STDERR join("\n ",@deleted_files), "\n";


		    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);

		    mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
		}

		my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};

		if (scalar(@reindex_files)>0) {
		    print STDERR "Files to reindex since last import:\n ";
		    print STDERR join("\n ",@reindex_files), "\n";
		    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
		    mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
		}

	    }
	}
    }

    # Check for existence of the file that's to contain earliestDateStamp in archivesdir
    # Do nothing if the file already exists (file exists on incremental build).
    # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
    # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
    # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
    # oailastmodified and oailastmodifieddate
    my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp");
    if (!-f $earliestDatestampFile && -d $archivedir) {
	my $current_time_in_seconds = time; # in seconds

	if(open(FOUT, ">$earliestDatestampFile")) {
	    # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
	    print FOUT $current_time_in_seconds;
	    close(FOUT);
	}
	else {
	    &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
	}

    }


    # the main plugin pipeline pass over the import directory
    $self->perform_process_files($manifest, $pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs);

    if ($saveas eq "FedoraMETS") {
	# create collection "doc obj" for Fedora that contains
	# collection-level metadata

	my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
	$doc_obj->set_OID("collection");

	my $col_name = undef;
	my $col_meta = $collectcfg->{'collectionmeta'};

	if (defined $col_meta) {
	    store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
	    store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
	}
	$processor->process($doc_obj);
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # Store the value of OIDCount (used in doc.pm) so it can be
    # restored correctly to this value on an incremental build
    # - this OIDcount file should only be generated for numerical oids [jmt12]
    if ($self->{'OIDtype'} eq 'incremental')
    {
	store_doc_oid_count($archivedir);
    }

    # write out the archive information file
    $processor->close_file_output() if (defined $groupsize) && ($groupsize > 1);
    $processor->close_group_output() if $processor->is_group();

    # for backwards compatability with archives.inf file
    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
	$archive_info->save_info($arcinfo_doc_filename);
    }
    else {
	$archive_info->save_revinfo_db($arcinfo_src_filename);
    }

    return $pluginfo;
}
888
# @function perform_process_files()
# while process_files() above prepares the system to import files this is the
# function that actually initiates the plugin pipeline to process the files.
# This function can therefore be overridden in subclasses of inexport.pm should
# they wish to do different or further processing
# @param $manifest the manifest path ('' when no manifest is in use)
# @param $pluginfo loaded plugin information structure
# @param $importdir directory to scan (used when no specific file given)
# @param $file_to_import a single file to process (manifest v2+), or ''
# @author jmt12
sub perform_process_files
{
    my $self = shift(@_);
    my ($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs) = @_;
    my $gli = $self->{'gli'};
    # specific file to process - via manifest version 2+
    if ($file_to_import ne '')
    {
	&plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }
    # global file scan - if we are using a new version manifest, files would have
    # been read above. Older manifests use extra settings in the $block_hash to
    # control what is imported, while non-manifest imports use a regular
    # $block_hash (so obeying process_exp and block_exp) [jmt12]
    # NOTE(review): 'manifest_version < 1' looks suspicious -- the default
    # version is 1, so version-1 manifests take neither branch here and fall
    # through to the "Skipping" message; confirm whether '< 2' was intended
    elsif ($manifest eq '' || $self->{'manifest_version'} < 1)
    {
	&plugin::read ($pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }
    else
    {
	print STDERR "Skipping perform_process_files() due to manifest presence and version\n";
    }
}
# perform_process_files()
919
# @function generate_statistics()
# Print the "<mode>.complete" banner to the configured output stream and ask
# the plugins to report their collected statistics.
sub generate_statistics
{
    my ($self, $pluginfo) = @_;

    my $out = $self->{'out'};
    my $banner = "*********************************************\n";

    &gsprintf($out, "\n");
    &gsprintf($out, $banner);
    &gsprintf($out, "{" . $self->{'mode'} . ".complete}\n");
    &gsprintf($out, $banner);

    &plugin::write_stats($pluginfo, 'STDERR', $self->{'faillogname'}, $self->{'gli'});
}
# generate_statistics()
939
940
# @function deinit()
# Release the output and fail-log file handles, but only if this object was
# the one that opened them (tracked by the 'close_out'/'close_faillog' flags).
sub deinit
{
    my ($self) = @_;

    if ($self->{'close_out'})
    {
        close OUT;
    }
    if ($self->{'close_faillog'})
    {
        close FAILLOG;
    }
}
# deinit()
951
952
# Copy one collection-level metadata item ($field) out of the collection
# configuration hash and attach it to the top section of the given document
# object.  Language-qualified keys of the form "[l=xx]" become a "^xx" suffix
# on the "ex.<field>" metadata label.
sub store_collectionmeta
{
    my ($collectionmeta, $field, $doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $values_by_lang = $collectionmeta->{$field};

    while (my ($lang_key, $value) = each %$values_by_lang)
    {
        my $label = 'ex.' . $field;

        # "[l=xx]" keys carry a language qualifier - preserve it as "^xx"
        if ($lang_key =~ m/^\[l=(.*?)\]$/)
        {
            $label .= '^' . $1;
        }

        $doc_obj->add_utf8_metadata($top_section, $label, $value);

        # see collConfigxml.pm: GS2's "collectionextra" is called "description"
        # in GS3, while GS2's "collectionname" is called "name" in GS3.  The
        # $nameMap variable in collConfigxml.pm maps between GS2 and GS3.
        if ($label eq 'ex.collectionname' || $label eq 'ex.collectionname^en')
        {
            $doc_obj->add_utf8_metadata($top_section, "dc.Title", $value);
        }
    }
}
990
991
# Return the full path of the "OIDcount" file kept inside the given archives
# directory (used to persist doc.pm's OID counter between builds).
sub oid_count_file
{
    my ($archivedir) = @_;

    return &FileUtils::filenameConcatenate($archivedir, 'OIDcount');
}
996
997
# @function prime_doc_oid_count()
# Restore doc.pm's OID counter from the "OIDcount" file in the archives
# directory (if one exists), so that an incremental build continues numbering
# documents where the previous import left off.  Does nothing when the file
# is absent (e.g. on a first/full import).
# @param $archivedir the collection's archives directory
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # nothing recorded yet - leave doc.pm's counter at its default
    return unless (-e $oid_count_filename);

    # three-argument open with a lexical filehandle (replaces the old
    # two-argument bareword form, which is unsafe with odd filenames)
    if (open(my $oidin, '<', $oid_count_filename)) {
        my $OIDcount = <$oidin>;
        close($oidin);

        # guard against an empty file: only prime the counter when we
        # actually read a line
        if (defined $OIDcount) {
            chomp $OIDcount;
            $doc::OIDcount = $OIDcount;
        }
    }
    else {
        &gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
    }
}
1017
# @function store_doc_oid_count()
# Use the file "OIDcount" in the archives directory to record what value
# doc.pm's OID counter got up to, so prime_doc_oid_count() can restore it on
# the next incremental build.
# @param $archivedir the collection's archives directory
sub store_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # @todo $oidout = &FileUtils::openFileDescriptor($oid_count_filename, 'w') [jmt12]
    # three-argument open with a lexical filehandle (replaces the old
    # two-argument bareword form, which is unsafe with odd filenames)
    if (open(my $oidout, '>', $oid_count_filename)) {
        print {$oidout} $doc::OIDcount, "\n";
        close($oidout);
    }
    else {
        &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
1036
1037
1038
# @function new_vs_old_import_diff()
# Compare the files found by the current import scan ($block_hash->{'all_files'})
# against the file list recorded by the previous import
# ($archive_info->{'prev_import_filelist'}), classifying every file into one of
# the $block_hash buckets: 'new_files', 'existing_files',
# 'new_or_modified_metadata_files', 'reindex_files' or 'deleted_files'.
# Works entirely by side effect on $block_hash; no meaningful return value.
# @param $archive_info arcinfo object holding the previous import's state
# @param $block_hash hash of file-classification buckets (modified in place)
# @param $importdir import directory, used to absolutise relative paths
# @param $archivedir archives directory (location of the archiveinf databases)
# @param $verbosity numeric verbosity; >= 2 prints per-file detail to STDERR
# @param $incremental_mode "all" (keep existing files) or e.g. "onlyadd"
#   (re-treat previously seen files as new, with a warning)
sub new_vs_old_import_diff
{
    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    # -M gives the file age in days, so a SMALLER value means a NEWER file;
    # anything newer than the archiveinf-doc database changed since last build
    my $archiveinf_timestamp = -M $arcinfo_doc_filename;

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default

    my $prev_all_files = $archive_info->{'prev_import_filelist'};
    # maps absolute path => path as recorded in the previous import
    my $full_prev_all_files = {};

    foreach my $prev_file (keys %$prev_all_files) {

	if (!&FileUtils::isFilenameAbsolute($prev_file)) {
	    # NOTE(review): previous-import relative paths are resolved against
	    # GSDLCOLLECTDIR, whereas current files below use $importdir
	    my $full_prev_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$prev_file);
	    $full_prev_all_files->{$full_prev_file} = $prev_file;
	}
	else {
	    $full_prev_all_files->{$prev_file} = $prev_file;
	}
    }


    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

	my $full_curr_file = $curr_file;

	# entry in 'all_files' is moved to either 'existing_files',
	# 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

	if (!&FileUtils::isFilenameAbsolute($curr_file)) {
	    # add in import dir to make absolute
	    $full_curr_file = &FileUtils::filenameConcatenate($importdir,$curr_file);
	}

	# figure out if new file or not
	if (defined $full_prev_all_files->{$full_curr_file}) {
	    # delete it so that only files that need deleting are left
	    delete $full_prev_all_files->{$full_curr_file};

	    # had it before. is it a metadata file?
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {

		# is it modified??
		if (-M $full_curr_file < $archiveinf_timestamp) {
		    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
		    # its newer than last build
		    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
		}
		# NOTE(review): an unmodified metadata file is dropped from
		# every bucket here - presumably deliberate, since its content
		# is already reflected in the archives
	    }
	    else {
		if ($incremental_mode eq "all") {

		    # had it before
		    $block_hash->{'existing_files'}->{$full_curr_file} = 1;

		}
		else {
		    # Warning in "onlyadd" mode, but had it before!
		    print STDERR "Warning: File $full_curr_file previously imported.\n";
		    print STDERR " Treating as new file\n";

		    $block_hash->{'new_files'}->{$full_curr_file} = 1;

		}
	    }
	}
	else {
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# the new file is the special sort of file greenstone uses
		# to attach metadata to src documents
		# i.e metadata.xml
		# (but note, the filename used is not constrained in
		# Greenstone to always be this)

		print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
		$block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
	    }
	    else {
		$block_hash->{'new_files'}->{$full_curr_file} = 1;
	    }
	}


	delete $block_hash->{'all_files'}->{$curr_file};
    }




    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
	# fileparse assumed to come from File::Basename (imported earlier in
	# this file) - the pattern strips the final ".ext" suffix
	my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

	$situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
	$situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression

	# Go through existing_files, and mark anything that is contained
	# within 'situated_dir' to be reindexed (in case some of the metadata
	# attaches to one of these files)

	my $reindex_files = [];

	foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

	    # prefix match: the metadata file's directory is an ancestor of
	    # (or equal to) the existing file's path
	    if ($existing_f =~ m/^$situated_dir/) {

		print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

		push(@$reindex_files,$existing_f);
		$block_hash->{'reindex_files'}->{$existing_f} = 1;
		delete $block_hash->{'existing_files'}->{$existing_f};

	    }
	}

	# metadata file needs to be in new_files list so parsed by MetadataXMLPlug
	# (or equivalent)
	$block_hash->{'new_files'}->{$new_mdf} = 1;

    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    # NOTE(review): this list is populated but never read after the loop -
    # the authoritative record is $block_hash->{'reindex_files'}
    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
	if (-M $existing_filename < $archiveinf_timestamp) {
	    # file is newer than last build

	    my $existing_file = $existing_filename;
	    #my $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'});

	    #my $collectdir_resafe = &util::filename_to_regex($collectdir);
	    #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;

	    print STDERR "**** Reindexing existing file: $existing_file\n";

	    push(@$reindex_files,$existing_file);
	    $block_hash->{'reindex_files'}->{$existing_filename} = 1;
	}

    }


    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # supress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    my @deleted_files = values %$full_prev_all_files;
    # NOTE(review): map used in void context purely for its side effect on
    # $block_hash - a foreach loop would express this more idiomatically
    map { my $curr_file = $_;
	  my $full_curr_file = $curr_file;

	  if (!&FileUtils::isFilenameAbsolute($curr_file)) {
	      # add in import dir to make absolute

	      $full_curr_file = &FileUtils::filenameConcatenate($collectdir,$curr_file);
	  }


	  # only files that no longer exist on disk are actually marked deleted
	  if (!-e $full_curr_file) {
	      $block_hash->{'deleted_files'}->{$curr_file} = 1;
	  }
    } @deleted_files;



}
1236
1237
# @function mark_docs_for_deletion()
# This is used to delete "deleted" docs, and to remove old versions of
# "changed" docs: every OID whose source file appears in @$deleted_files has
# its archiveinf-src record removed and its archiveinf-doc <index-status> set
# to "D", so buildcol.pl later deletes (or reimports) it.
# @param $archive_info arcinfo object for this collection
# @param $block_hash file-classification buckets; 'reindex_files' is updated
# @param $deleted_files ref to array of files to process
# @param $archivedir archives directory (location of the infodb files)
# @param $verbosity numeric verbosity; > 1 prints per-OID progress
# @param $mode 'delete' or 'reindex' (affects messages and final clean-up)
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
	$mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
	# use 'archiveinf-src' info database file to look up all the OIDs
	# that this file is used in (note in most cases, it's just one OID)

	my $src_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
	my $oids = $src_rec->{'oid'};
	# NOTE(review): $file_record_deleted appears unused below
	my $file_record_deleted = 0;

	# delete the src record
	my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
	&dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
	&dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);


	foreach my $oid (@$oids) {

	    # find the source doc (the primary file that becomes this oid)
	    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
	    my $doc_source_file = $doc_rec->{'src-file'}->[0];
	    if (!&util::filename_is_absolute($doc_source_file)) {
		$doc_source_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
	    }

	    if ($doc_source_file ne $file) {
		# its an associated or metadata file

		# mark source doc for reimport as one of its assoc files has changed or deleted
		$block_hash->{'reindex_files'}->{$doc_source_file} = 1;

	    }
	    # skip OIDs already marked "D" so we don't rewrite their records
	    my $curr_status = $archive_info->get_status_info($oid);
	    if (defined($curr_status) && (($curr_status ne "D"))) {
		if ($verbosity>1) {
		    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
		}
		# mark oid for deletion (it will be deleted or reimported)
		$archive_info->set_status_info($oid,"D");
		# rewrite the raw record with its <index-status> forced to D
		my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
		$val =~ s/^<index-status>(.*)$/<index-status>D/m;

		my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
		my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

		&dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
		&dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
	    }
	}

    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.

    if ($mode eq "delete") {
	foreach my $file (@$deleted_files) {
	    if (defined $block_hash->{'reindex_files'}->{$file}) {
		delete $block_hash->{'reindex_files'}->{$file};
	    }
	}
    }


}
1322
# @function add_dir_contents_to_list()
# Recursively collect every file beneath $dirname into the array referenced
# by $list, skipping the "." / ".." entries and ".svn" metadata directories.
# @param $dirname directory to scan
# @param $list ref to array that file paths are pushed onto
# @return -1 if the directory could not be read, 0 otherwise
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # find all the entries in the directory, using a lexical directory handle
    # (the old bareword DIR handle was a shared global)
    if (!opendir(my $dir_handle, $dirname)) {
	print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    # read everything up front so the handle is closed before we recurse
    my @entries = readdir($dir_handle);
    closedir($dir_handle);

    foreach my $entry (@entries) {
	next if ($entry =~ m/^\.\.?$/);   # skip "." and ".."
	next if ($entry =~ /^\.svn$/);    # skip subversion metadata
	my $full_file = &FileUtils::filenameConcatenate($dirname, $entry);
	if (-d $full_file) {
	    &add_dir_contents_to_list($full_file, $list);
	} else {
	    push (@$list, $full_file);
	}
    }

    return 0;
}
1351
1352
13531;
Note: See TracBrowser for help on using the repository browser.