source: gs2-extensions/parallel-building/trunk/src/perllib/inexport.pm @ 26932

Last change on this file since 26932 was 26932, checked in by jmt12, 11 years ago

Altered all calls to built-in Perl file tests to use the util library equivalents instead. This allows better awareness of HDFS and other unusual file paths. Added support for the newer manifest version (where listed files are followed verbatim). Only write OIDcount for collections with numerical OIDs.

File size: 42.9 KB
###########################################################################
#
# inexport.pm -- useful class to support import.pl and export.pl
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package inexport;

use strict;

no strict 'refs'; # allow filehandles to be variables and vice versa
no strict 'subs'; # allow barewords (eg STDERR) as function arguments

use arcinfo;
use colcfg;
use dbutil;
use doc;
use plugin;
use plugout;
use manifest;
use inexport;
use util;
use scriptutil;
use FileHandle;
use gsprintf 'gsprintf';
use printusage;
use parse2;

use File::Basename;

sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # parse() returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (otherwise the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($self->{'listall'}) {
        if ($self->{'xml'}) {
            &PrintUsage::print_xml_usage($opt_listall_options);
        }
        else
        {
            &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
        }
        die "\n";
    }

    if ($self->{'xml'}) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return bless $self, $class;
    }

    if ($self->{'gli'}) { # the GLI wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # Now check that we had exactly one leftover argument, which should be
    # the collection name. We don't want to do this earlier because the
    # -xml argument doesn't need a collection name.
    # Also output the usage if the user specified -h.
    if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        open (OUT, ">$out") ||
            (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
        $out = 'inexport::OUT';
        $self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # @$argv should now hold only one item: the name of the collection
    $self->{'collection'} = shift @$argv;

    if ((defined $self->{'jobs'}) && ($self->{'jobs'}>1)) {
        require ParallelInexport;
    }

    return bless $self, $class;
}
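
# Typical usage, as a rough sketch (the exact contents of the option hashes
# come from the calling script's parse2 argument tables, so treat the details
# below as illustrative rather than definitive):
#
#   my $inexport = inexport->new("import", \@ARGV, $options);
#   my $collection = $inexport->get_collection();
#   my ($config_filename, $collectcfg) =
#       $inexport->read_collection_cfg($collection, $options);
#   $inexport->set_collection_options($collectcfg);
#   my $pluginfo = $inexport->process_files($config_filename, $collectcfg);
#   $inexport->generate_statistics($pluginfo);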

# Simplified version of the constructor for use with CGI scripts
sub newCGI
{
    my $class = shift (@_);
    my ($mode,$collect,$gsdl_cgi,$opt_site) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    $self->{'out'} = STDERR;

    if (defined $gsdl_cgi) {
        $self->{'site'} = $opt_site;
        my $collect_dir = $gsdl_cgi->get_collection_dir($opt_site);
        $self->{'collectdir'} = $collect_dir;
    }
    else {
        $self->{'site'} = "";
        $self->{'collectdir'} = &util::filename_cat($ENV{'GSDLHOME'},"collect");
    }
    $self->{'faillog'} = "";

    $self->{'collection'} = $collect;

    return bless $self, $class;
}

sub get_collection
{
    my $self = shift @_;

    return $self->{'collection'};
}

sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Add the collection's perllib dir to the include path, in case
    # there are collection-specific modules
    my $collection_perllib_path = $ENV{'GSDLCOLLECTDIR'} . '/perllib';
    my $inc_paths = join(':', @INC);
    if ($inc_paths !~ /$collection_perllib_path/)
    {
        unshift (@INC, $collection_perllib_path); # [jmt12]
    }

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    open (FAILLOG, ">$faillog") ||
        (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);

    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;

    # Read in the collection configuration file
    my ($config_filename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
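
# Reconcile command-line options with collect.cfg settings: values given on
# the command line win, collect.cfg supplies the rest, and sensible defaults
# (hash OIDs, verbosity 2, the collection's own import/archives directories,
# and so on) fill in anything still missing.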
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};
    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'} || "";
    my $out = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
        $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
        # we can't use the text version for archives dbs
        $collectcfg->{'infodbtype'} = "gdbm";
    }
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    # Fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove any trailing /
    if ($importdir eq "")
    {
        $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import");
    }
    else
    {
        # hijack filename_cat to sanitize the user-provided importdir [jmt12]
        $importdir = &util::filename_cat ($importdir);
    }
    if (!&util::dir_exists($importdir)) {
        &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
        die "\n";
    }
    $self->{'importdir'} = $importdir;

    if ($archivedir eq "") {
        if ($inexport_mode eq "import") {
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
        }
        elsif ($inexport_mode eq "export") {
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
        }
        else {
            print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
            print STDERR "         Defaulting to 'archives' for file output\n";
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
        }
    }
    else
    {
        # use filename_cat() to sanitize the user-provided archive directory, as
        # it is more aware of protocols etc.
        $archivedir = &util::filename_cat($archivedir);
    }
    $self->{'archivedir'} = $archivedir;

    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    $self->{'verbosity'} = $verbosity;

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
        $self->{'manifest'} = $collectcfg->{'manifest'};
    }
    # Default value
    $self->{'manifest_version'} = 0;

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $self->{'gzip'} = 1;
        }
    }

    if ($self->{'maxdocs'} !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
        } else {
            $self->{'maxdocs'} = -1; # the default
        }
    }

    if ((defined $self->{'groupsize'}) && ($self->{'groupsize'} == 1)) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $self->{'groupsize'} = $collectcfg->{'groupsize'};
        }
    }

    if (!defined $self->{'OIDtype'}
        || ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) {
        if (defined $collectcfg->{'OIDtype'}
            && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
            $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
        } else {
            $self->{'OIDtype'} = "hash"; # the default
        }
    }

    if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
        if (defined $collectcfg->{'OIDmetadata'}) {
            $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
        } else {
            $self->{'OIDmetadata'} = "dc.Identifier"; # the default
        }
    }

    my $sortmeta = $self->{'sortmeta'};
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $self->{'groupsize'} > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }
    $self->{'sortmeta'} = $sortmeta;

    if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") {
        $self->{'removeprefix'} = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") {
        $self->{'removesuffix'} = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
                                                   $self->{'incremental'}, $checkdir,
                                                   $collectcfg);

    $self->{'removeold'} = $removeold;
    $self->{'keepold'} = $keepold;
    $self->{'incremental'} = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;

    # Since this wasted my morning, let's at least warn the user that manifest
    # files now *only* work if keepold is set. [jmt12]
    if ($self->{'manifest'} && !$self->{'keepold'})
    {
        print STDERR "Warning: -manifest flag should not be specified without also setting -keepold or -incremental. Ignoring.\n";
    }
}
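
# The top-level driver for an import or export pass: loads the plugins and
# the plugout, determines which source files are new, changed, or deleted
# (either from a manifest file or by diffing against the previous import),
# pushes them through the plugin pipeline, and finally saves the archive
# information database. Returns the loaded plugin information.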
sub process_files
{
    my $self = shift @_;
    my ($config_filename,$collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};

    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};

    my $incremental = $self->{'incremental'};
    my $incremental_mode = $self->{'incremental_mode'};

    my $removeold = $self->{'removeold'};
    my $keepold = $self->{'keepold'};

    my $saveas = $self->{'saveas'};
    my $OIDtype = $self->{'OIDtype'};
    my $OIDmetadata = $self->{'OIDmetadata'};

    my $out = $self->{'out'};
    my $faillog = $self->{'faillog'};

    my $maxdocs = $self->{'maxdocs'};
    my $gzip = $self->{'gzip'};
    my $groupsize = $self->{'groupsize'};
    my $sortmeta = $self->{'sortmeta'};

    my $removeprefix = $self->{'removeprefix'};
    my $removesuffix = $self->{'removesuffix'};

    my $gli = $self->{'gli'};

    my $jobs = $self->{'jobs'};
    my $epoch = $self->{'epoch'};

    # related to export
    my $xsltfile = $self->{'xsltfile'};
    my $group_marc = $self->{'group_marc'};
    my $mapping_file = $self->{'mapping_file'};
    my $xslt_mets = $self->{'xslt_mets'};
    my $xslt_txt = $self->{'xslt_txt'};
    my $fedora_namespace = $self->{'fedora_namespace'};
    my $metadata_prefix = $self->{'metadata_prefix'};

    if ($inexport_mode eq "import") {
        print STDERR "<Import>\n" if $gli;
    }
    else {
        print STDERR "<export>\n" if $gli;
    }

    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
    if ($self->{'manifest'} ne "") {
        my $manifest_filename = $self->{'manifest'};

        if (!&util::filename_is_absolute($manifest_filename)) {
            $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
        }

        $self->{'manifest'} =~ s/[\\\/]+/\//g;
        $self->{'manifest'} =~ s/\/$//;

        $manifest_lookup->parse($manifest_filename);

        # Manifests may now include a version number
        $self->{'manifest_version'} = $manifest_lookup->get_version();
    }
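
    # For reference, a manifest is a small XML file listing the files to
    # index, reindex, or delete -- something along these lines (illustrative
    # only; see manifest.pm for the authoritative format, including the
    # version attribute used by the newer verbatim-style manifests):
    #
    #   <Manifest>
    #     <Index>   <Filename>newdoc.html</Filename>     </Index>
    #     <Reindex> <Filename>changeddoc.html</Filename> </Reindex>
    #     <Delete>  <Filename>olddoc.html</Filename>     </Delete>
    #   </Manifest>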

    my $manifest = $self->{'manifest'};

    # load all the plugins
    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    my $plugin_incr_mode = $incremental_mode;
    if ($manifest ne "") {
        # if we have a manifest file, then we pretend we are fully incremental for plugins
        $plugin_incr_mode = "all";
    }
    # some global options for the plugins
    my @global_opts = ();

    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory (and tmp
    # directory) if needed
    if ($removeold) {
        if (&util::dir_exists($archivedir)) {
            &gsprintf($out, "{import.removing_archives}\n");
            &util::rm_r ($archivedir);
        }
        my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
        $tmpdir =~ s/[\\\/]+/\//g;
        $tmpdir =~ s/\/$//;
        if (&util::dir_exists($tmpdir)) {
            &gsprintf($out, "{import.removing_tmpdir}\n");
            &util::rm_r ($tmpdir);
        }
    }

    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file

    # BACKWARDS COMPATIBILITY: just in case there are old .ldb/.bdb files
    # (won't do anything for other infodbtypes)
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));

    # Warning! Black magic follows. When the following functions are called on
    # the GDBMServer class they will actually prompt the running of the server
    # and attach themselves as listeners (even though they don't do anything).
    # This is done so that, in parallel importing, the server will persist
    # until the top-level import.pl (which will be the first to call this
    # function) completes. [jmt12]
    my $create_server = 0;
    # - infodbs of type *server need to be started on the same machine that
    #   runs parallel-import.sh, especially when distributed processing
    #   gets involved
    if ($collectcfg->{'infodbtype'} =~ /server$/)
    {
        $create_server = 1;
    }
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, $create_server);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, $create_server);

    my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
    $archive_info->load_info ($arcinfo_doc_filename);

    if ($manifest eq "") {
        # Load in the list of files in the import folder from the last import (if present)
        $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    }

    #### Use Plugout ####
    my $plugout;

    if ($inexport_mode eq "import") {
        if (defined $collectcfg->{'plugout'}) {
            # If a plugout was specified in the collect.cfg file, assume it is sensible.
            # We can't check the name because it could be anything, if it is a custom plugout.
            $plugout = $collectcfg->{'plugout'};
        }
        else {
            if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
                push @$plugout,"GreenstoneXMLPlugout";
            }
            else {
                push @$plugout,$saveas."Plugout";
            }
        }
    }
    else {
        if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
            $plugout = $collectcfg->{'plugout'};
        }
        else {
            if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
                push @$plugout,"GreenstoneMETSPlugout";
            }
            else {
                push @$plugout,$saveas."Plugout";
            }
        }
    }

    my $plugout_name = $plugout->[0];

    push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
    push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
    push @$plugout,("-debug") if ($debug);
    push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
    push @$plugout,("-gzip_output") if ($gzip);
    push @$plugout,("-output_handle",$out) if (defined $out);

    push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");

    if ($plugout_name =~ m/^MARCXMLPlugout$/) {
        push @$plugout,("-group") if ($group_marc);
        push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "");
    }
    if ($plugout_name =~ m/^.*METSPlugout$/) {
        push @$plugout,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $xslt_mets ne "");
        push @$plugout,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $xslt_txt ne "");
    }

    if ($plugout_name eq "FedoraMETSPlugout") {
        push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "");
    }

    if ($plugout_name eq "DSpacePlugout") {
        push @$plugout,("-metadata_prefix",$metadata_prefix) if (defined $metadata_prefix && $metadata_prefix ne "");
    }
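
    # By this point $plugout is an arrayref holding the plugout name followed
    # by its option flags, e.g. (with illustrative values):
    #   [ 'GreenstoneXMLPlugout', '-output_info', $archive_info, '-verbosity', 2 ]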
    my $processor = &plugout::load_plugout($plugout);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($removeold) {
        # occasionally plugins may want to do something on removeold,
        # e.g. pharos image indexing
        &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
    }

    # process the import directory
    my $block_hash = {};
    $block_hash->{'new_files'} = {};
    $block_hash->{'reindex_files'} = {};
    # All of these are set somewhere else, so it's useful to define them
    # here. [jmt12]
    $block_hash->{'all_files'} = {};
    $block_hash->{'deleted_files'} = {};
    $block_hash->{'file_blocks'} = {};
    $block_hash->{'metadata_files'} = {};
    $block_hash->{'shared_fileroot'} = '';
    # A new flag so we can tell, way down in the plugins, that we had a
    # manifest [jmt12]
    $block_hash->{'manifest'} = 'false';
    my $metadata = {};

    # The global blocking pass may set up some metadata.
    # - When we have a manifest file we don't do this -- unless the collection
    #   configuration indicates this collection contains complex (inherited)
    #   metadata. [jmt12]
    if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true'))
    {
        #print "Temporarily skipping global file scan as it is slow as molasses\n";
        &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
    }
    else
    {
        print "Skipping global file scan due to manifest and complexmeta configuration\n";
    }

    if ($manifest ne "") {

        $block_hash->{'manifest'} = 'true';

        #
        # 1. Process deleted files first
        #
        my @deleted_files = keys %{$manifest_lookup->{'delete'}};
        my @full_deleted_files = ();

        # ensure all filenames are absolute
        foreach my $df (@deleted_files) {
            my $full_df =
                (&util::filename_is_absolute($df))
                ? $df
                : &util::filename_cat($importdir,$df);

            if (-d $full_df) {
                &add_dir_contents_to_list($full_df, \@full_deleted_files);
            } else {
                push(@full_deleted_files,$full_df);
            }
        }

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
        mark_docs_for_deletion($archive_info,{},
                               \@full_deleted_files,
                               $archivedir, $verbosity, "delete");

        #
        # 2. Now files for reindexing
        #
        my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
        my @full_reindex_files = ();
        # ensure all filenames are absolute
        foreach my $rf (@reindex_files) {
            my $full_rf =
                (&util::filename_is_absolute($rf))
                ? $rf
                : &util::filename_cat($importdir,$rf);

            if (-d $full_rf) {
                &add_dir_contents_to_list($full_rf, \@full_reindex_files);
            } else {
                push(@full_reindex_files,$full_rf);
            }
        }

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
        mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");

        # And now, to ensure the new version of each file is processed by the
        # appropriate plugin, add it to the block_hash reindex list
        foreach my $full_rf (@full_reindex_files) {
            $block_hash->{'reindex_files'}->{$full_rf} = 1;
        }

        #
        # 3. Now finally any new files -- add to the block_hash new_files list
        #
        my @new_files = keys %{$manifest_lookup->{'index'}};
        my @full_new_files = ();

        foreach my $nf (@new_files) {
            # ensure filename is absolute
            my $full_nf =
                (&util::filename_is_absolute($nf))
                ? $nf
                : &util::filename_cat($importdir,$nf);

            if (-d $full_nf) {
                &add_dir_contents_to_list($full_nf, \@full_new_files);
            } else {
                push(@full_new_files,$full_nf);
            }
        }

        my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
        # need to check this file exists before trying to read it [jmt12]
        if (-e $arcinfo_src_filename)
        {
            my $arcinfodb_map = {};
            &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
            foreach my $f (@full_new_files) {
                # check that we haven't seen it already
                if (defined $arcinfodb_map->{$f}) {
                    # TODO make better warning
                    print STDERR "Warning: $f already in src archive, \n";
                } else {
                    $block_hash->{'new_files'}->{$f} = 1;
                }
            }
            undef $arcinfodb_map;
        }
        # no existing files, so we can just add all the ones that need adding
        # [jmt12]
        else
        {
            foreach my $f (@full_new_files)
            {
                $block_hash->{'new_files'}->{$f} = 1;
            }
        }

        # If we are not using complex inherited metadata (and thus have skipped
        # the global file scan) we need to at least check for a matching
        # metadata.xml for the files being indexed/reindexed. [jmt12]
        # - unless we are using the newer version of manifests, which are treated
        #   verbatim and should have a metadata element for metadata files (so
        #   we can explicitly process metadata files other than metadata.xml)
        if ($self->{'manifest_version'} < 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true'))
        {
            my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
            foreach my $file_to_import (@all_files_to_import)
            {
                my $metadata_xml_path = $file_to_import;
                $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/;
                if (&util::file_exists($metadata_xml_path))
                {
                    &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli);
                }
            }
        }

        if ($self->{'manifest_version'} > 0)
        {
            # Process metadata files (?)
            # Process files
            foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}})
            {
                &plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
            }
        }
    }
    else {
        # if incremental, we read through the import folder to see what's changed

        if ($incremental || $incremental_mode eq "onlyadd") {
            prime_doc_oid_count($archivedir);

            # Can now work out which files are new, which already existed, and
            # which have been deleted
            new_vs_old_import_diff($archive_info,$block_hash,$importdir,
                                   $archivedir,$verbosity,$incremental_mode);

            my @new_files = sort keys %{$block_hash->{'new_files'}};
            if (scalar(@new_files) > 0) {
                print STDERR "New files and modified metadata files since last import:\n  ";
                print STDERR join("\n  ",@new_files), "\n";
            }

            if ($incremental) {
                # only look for deletions if we are truly incremental
                my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
                # Filter out any in the gsdl/tmp area
                my @filtered_deleted_files = ();
                my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
                my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
                $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
                $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);

                foreach my $df (@deleted_files) {
                    next if ($df =~ m/^$gsdl_tmp_area/);
                    next if ($df =~ m/^$collect_tmp_area/);

                    push(@filtered_deleted_files,$df);
                }

                @deleted_files = @filtered_deleted_files;

                if (scalar(@deleted_files) > 0) {
                    print STDERR "Files deleted since last import:\n  ";
                    print STDERR join("\n  ",@deleted_files), "\n";

                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);

                    mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
                }

                my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};

                if (scalar(@reindex_files) > 0) {
                    print STDERR "Files to reindex since last import:\n  ";
                    print STDERR join("\n  ",@reindex_files), "\n";
                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
                    mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
                }
            }
        }
    }

    # Check for the existence of the file that contains earliestDatestamp in
    # the archives dir. Do nothing if the file already exists (as it does on
    # an incremental build). If the file doesn't exist, as happens on a full
    # build, create it and write the current datestamp into it.
    # In buildcol, the file's contents are read and used to set
    # earliestDatestamp in GS2's build.cfg / GS3's buildconfig.xml.
    # In doc.pm, set_oaiLastModified (similar to set_lastmodified) creates the
    # doc fields oailastmodified and oailastmodifieddate.
    my $earliestDatestampFile = &util::filename_cat($archivedir, "earliestDatestamp");
    if (!-f $earliestDatestampFile && -d $archivedir) {
        my $current_time_in_seconds = time; # in seconds

        if (open(FOUT, ">$earliestDatestampFile")) {
            # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
            print FOUT $current_time_in_seconds;
            close(FOUT);
        }
        else {
            &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
        }
    }

    # now, whichever mode we are in, we can process the entire import folder
    if ((defined $jobs) && ($jobs > 1))
    {
        # if jobs is set to >1, run in parallel using the MPI helper
        # [hs, 1 july 2010]
        &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,
                                              $self->{'collection'}, $self->{'site'});
    }
    # only do this if we aren't using the newer paradigm for manifest files
    elsif ($self->{'manifest_version'} < 1)
    {
        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }

    if ($saveas eq "FedoraMETS") {
        # create a collection "doc obj" for Fedora that contains
        # collection-level metadata

        my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
        $doc_obj->set_OID("collection");

        my $col_name = undef;
        my $col_meta = $collectcfg->{'collectionmeta'};

        if (defined $col_meta) {
            store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
            store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
        }
        $processor->process($doc_obj);
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # Store the value of OIDcount (used in doc.pm) so it can be restored
    # correctly to this value on an incremental build
    # - this OIDcount file should only be generated for numerical OIDs [jmt12]
    if ($self->{'OIDtype'} eq 'incremental')
    {
        store_doc_oid_count($archivedir);
    }

    # write out the archive information file
    $processor->close_file_output() if (defined $groupsize) && ($groupsize > 1);
    $processor->close_group_output() if $processor->is_group();

    # for backwards compatibility with the archives.inf file
    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
        $archive_info->save_info($arcinfo_doc_filename);
    }
    else {
        $archive_info->save_revinfo_db($arcinfo_src_filename);
    }

    return $pluginfo;
}
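

# Write out the final import/export statistics via the plugins. Statistics
# are only produced for single-job runs; parallel (multi-job) imports skip
# this step.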
sub generate_statistics
{
    my $self = shift @_;
    my ($pluginfo) = @_;

    my $inexport_mode = $self->{'mode'};

    my $statsfile = $self->{'statsfile'};
    my $out = $self->{'out'};
    my $faillogname = $self->{'faillogname'};
    my $gli = $self->{'gli'};
    my $jobs = $self->{'jobs'};

    # write out import stats
    if ((!defined $jobs) || ($jobs == 1))
    {
        # only output statistics if this is not a parallel (multi-job) run
        # [hs, 1 july 2010]

        my $close_stats = 0;
        if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
            if (open (STATS, ">$statsfile")) {
                $statsfile = 'inexport::STATS';
                $close_stats = 1;
            } else {
                &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
                &gsprintf($out, "{import.stats_backup}\n");
                $statsfile = 'STDERR';
            }
        }

        &gsprintf($out, "\n");
        &gsprintf($out, "*********************************************\n");
        &gsprintf($out, "{$inexport_mode.complete}\n");
        &gsprintf($out, "*********************************************\n");

        &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
        if ($close_stats) {
            close STATS;
        }
    }

    close OUT if $self->{'close_out'};
    close FAILLOG;
}
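

# Copy a collection-level metadata field (e.g. collectionname or
# collectionextra) from the collect.cfg collectionmeta hash onto a document
# object as ex.* metadata. Language-qualified keys map onto the usual
# metadata suffix convention; illustratively, a 'collectionname' value stored
# under the key '[l=en]' becomes 'ex.collectionname^en'.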
sub store_collectionmeta
{
    my ($collectionmeta,$field,$doc_obj) = @_;

    my $section = $doc_obj->get_top_section();

    my $field_hash = $collectionmeta->{$field};

    foreach my $k (keys %$field_hash)
    {
        my $val = $field_hash->{$k};

        ### print STDERR "*** $k = $field_hash->{$k}\n";

        my $md_label = "ex.$field";

        if ($k =~ m/^\[l=(.*?)\]$/)
        {
            my $md_suffix = $1;
            $md_label .= "^$md_suffix";
        }

        $doc_obj->add_utf8_metadata($section,$md_label, $val);

        # See collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
        # while "collectionname" in GS2 is called "name" in GS3.
        # The $nameMap variable in collConfigxml.pm maps between GS2 and GS3.
        if (($md_label eq "ex.collectionname^en") || ($md_label eq "ex.collectionname"))
        {
            $doc_obj->add_utf8_metadata($section,"dc.Title", $val);
        }
    }
}
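

# The OIDcount file (kept in the archives directory) holds a single line:
# the value the doc.pm OID counter had reached at the end of the last import.
# prime_doc_oid_count() reads it back at the start of an incremental build
# and store_doc_oid_count() writes it out at the end, so that incrementally
# assigned OIDs carry on from where the previous import stopped.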
sub oid_count_file {
    my ($archivedir) = @_;
    return &util::filename_cat ($archivedir, "OIDcount");
}


sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    if (-e $oid_count_filename) {
        if (open(OIDIN,"<$oid_count_filename")) {
            my $OIDcount = <OIDIN>;
            chomp $OIDcount;
            close(OIDIN);

            $doc::OIDcount = $OIDcount;
        }
        else {
            &gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
        }
    }
}

sub store_doc_oid_count
{
    # Use the file "OIDcount" in the archives directory to record
    # what value doc.pm got up to
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    if (open(OIDOUT,&util::file_openfdcommand($oid_count_filename, '>'))) {
        print OIDOUT $doc::OIDcount, "\n";
        close(OIDOUT);
    }
    else {
        &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
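

# Compare the current contents of the import folder against the file list
# recorded at the previous import, and move every file into one of the
# block_hash buckets: 'new_files', 'existing_files', 'deleted_files',
# 'reindex_files', or 'new_or_modified_metadata_files'. A new or modified
# metadata file forces everything below its directory to be reindexed, since
# its metadata may attach to any of those files.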
sub new_vs_old_import_diff
{
    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    my $archiveinf_timestamp = -M $arcinfo_doc_filename;

    # First convert all files to absolute form. This is to support the
    # situation where the import folder is not the default.
    my $prev_all_files = $archive_info->{'prev_import_filelist'};
    my $full_prev_all_files = {};

    foreach my $prev_file (keys %$prev_all_files) {

        if (!&util::filename_is_absolute($prev_file)) {
            my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
            $full_prev_all_files->{$full_prev_file} = $prev_file;
        }
        else {
            $full_prev_all_files->{$prev_file} = $prev_file;
        }
    }

    # Figure out which are the new files and which are the existing files; by
    # implication, the files from the previous import that are no longer
    # there get marked for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

        my $full_curr_file = $curr_file;

        # each entry in 'all_files' is moved to either 'existing_files',
        # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

        if (!&util::filename_is_absolute($curr_file)) {
            # add in the import dir to make it absolute
            $full_curr_file = &util::filename_cat($importdir,$curr_file);
        }

        # figure out whether it is a new file or not
        if (defined $full_prev_all_files->{$full_curr_file}) {
            # delete it so that only files that need deleting are left
            delete $full_prev_all_files->{$full_curr_file};

            # had it before. is it a metadata file?
            if ($block_hash->{'metadata_files'}->{$full_curr_file}) {

                # is it modified?
                if (-M $full_curr_file < $archiveinf_timestamp) {
                    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
                    # it's newer than the last build
                    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
                }
            }
            else {
                if ($incremental_mode eq "all") {

                    # had it before
                    $block_hash->{'existing_files'}->{$full_curr_file} = 1;
                }
                else {
                    # Warning: in "onlyadd" mode, but had it before!
                    print STDERR "Warning: File $full_curr_file previously imported.\n";
                    print STDERR "         Treating as new file\n";

                    $block_hash->{'new_files'}->{$full_curr_file} = 1;
                }
            }
        }
        else {
            if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
                # the new file is the special sort of file Greenstone uses
                # to attach metadata to src documents, i.e. metadata.xml
                # (but note, the filename used is not constrained in
                # Greenstone to always be this)

                print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
                $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
            }
            else {
                $block_hash->{'new_files'}->{$full_curr_file} = 1;
            }
        }

        delete $block_hash->{'all_files'}->{$curr_file};
    }

    # Deal with the complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but it does
    # guarantee that by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
        my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

        $situated_dir =~ s/[\\\/]+$//; # remove trailing slashes
        $situated_dir =~ s/\\/\\\\/g;  # need to protect the windows slash \ in the regular expression

        # Go through existing_files, and mark anything that is contained
        # within 'situated_dir' to be reindexed (in case some of the metadata
        # attaches to one of these files)

        my $reindex_files = [];

        foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

            if ($existing_f =~ m/^$situated_dir/) {

                print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

                push(@$reindex_files,$existing_f);
                $block_hash->{'reindex_files'}->{$existing_f} = 1;
                delete $block_hash->{'existing_files'}->{$existing_f};
            }
        }

        # the metadata file needs to be in the new_files list so it is parsed
        # by MetadataXMLPlug (or equivalent)
        $block_hash->{'new_files'}->{$new_mdf} = 1;
    }

    # go through the remaining existing files and work out what has changed and needs to be reindexed
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
        if (-M $existing_filename < $archiveinf_timestamp) {
            # file is newer than the last build

            my $existing_file = $existing_filename;
            #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});

            #my $collectdir_resafe = &util::filename_to_regex($collectdir);
            #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;

            print STDERR "**** Reindexing existing file: $existing_file\n";

            push(@$reindex_files,$existing_file);
            $block_hash->{'reindex_files'}->{$existing_filename} = 1;
        }
    }

    # By this point full_prev_all_files contains the files mentioned in
    # archiveinf-src.db that are not in the 'import' folder (or whatever was
    # specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # each one exists on the filesystem and, if it does (unusual for a
    # "normal" file in import, but possible in the case of 'tmp' files),
    # suppress it from going into the final list.
    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    my @deleted_files = values %$full_prev_all_files;
    map { my $curr_file = $_;
          my $full_curr_file = $curr_file;

          if (!&util::filename_is_absolute($curr_file)) {
              # add in the collect dir to make it absolute
              $full_curr_file = &util::filename_cat($collectdir,$curr_file);
          }

          if (!-e $full_curr_file) {
              $block_hash->{'deleted_files'}->{$curr_file} = 1;
          }
    } @deleted_files;
}


# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
# $mode is 'delete' or 'reindex'
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
        $mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);

    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
        # use the 'archiveinf-src' info database file to look up all the OIDs
        # that this file is used in (note: in most cases, it's just one OID)

        my $src_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
        my $oids = $src_rec->{'oid'};
        my $file_record_deleted = 0;

        # delete the src record
        my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
        &dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
        &dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);

        foreach my $oid (@$oids) {

            # find the source doc (the primary file that becomes this oid)
            my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
            my $doc_source_file = $doc_rec->{'src-file'}->[0];
            if (!&util::filename_is_absolute($doc_source_file)) {
                $doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
            }

            if ($doc_source_file ne $file) {
                # it's an associated or metadata file

                # mark the source doc for reimport, as one of its assoc files has changed or been deleted
                $block_hash->{'reindex_files'}->{$doc_source_file} = 1;
            }
            my $curr_status = $archive_info->get_status_info($oid);
            if (defined($curr_status) && (($curr_status ne "D"))) {
                if ($verbosity>1) {
                    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
                }
                # mark the oid for deletion (it will be deleted or reimported)
                $archive_info->set_status_info($oid,"D");
                my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
                $val =~ s/^<index-status>(.*)$/<index-status>D/m;

                my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
                my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

                &dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
                &dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
            }
        }
    }

    # Now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/been deleted) when they have been deleted themselves.
    # This is only done in delete mode.
    if ($mode eq "delete") {
        foreach my $file (@$deleted_files) {
            if (defined $block_hash->{'reindex_files'}->{$file}) {
                delete $block_hash->{'reindex_files'}->{$file};
            }
        }
    }
}
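
# Recursively collect every file underneath $dirname into @$list, skipping
# '.', '..' and .svn directories. Returns -1 if the directory cannot be read.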
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # Recurse over the directory contents.
    my (@dir, $subfile);

    # find all the files in the directory
    if (!opendir (DIR, $dirname)) {
        print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
        return -1; # error in processing
    }
    @dir = readdir (DIR);
    closedir (DIR);

    for (my $i = 0; $i < scalar(@dir); $i++) {
        my $subfile = $dir[$i];
        next if ($subfile =~ m/^\.\.?$/);
        next if ($subfile =~ /^\.svn$/);
        my $full_file = &util::filename_cat($dirname, $subfile);
        if (-d $full_file) {
            &add_dir_contents_to_list($full_file, $list);
        } else {
            push (@$list, $full_file);
        }
    }
}

1;