source: gs2-extensions/parallel-building/trunk/src/perllib/inexport.pm@ 24686

Last change on this file since 24686 was 24686, checked in by jmt12, 13 years ago

Several changes to ensure parallel importing plays nicely with manifest files and (simple) accompanying metadata.xml files. Also made it so initial calls to get_infodb_file_path run the GDBMServer (if necessary) to ensure it persists through parallel importing

File size: 41.2 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use plugin;
38use plugout;
39use manifest;
40use inexport;
41use util;
42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
47
48use File::Basename;
49
sub new
{
    # Constructor used by import.pl and export.pl.
    #
    # Arguments:
    #   $mode                - "import" or "export"; stored in $self->{'mode'}
    #   $argv                - array ref of remaining command-line args; parsed
    #                          options are removed, leaving the collection name
    #   $options             - argument specification hash (with an 'args' key)
    #                          used both for parsing and for usage messages
    #   $opt_listall_options - argument spec printed when -listall is given
    #
    # Returns a blessed inexport object, or dies (with usage text already
    # printed) on parse failure, -listall, -h, or a wrong argument count.
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins; parse2::parse fills parsed
    # option values directly into %$self
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    # -listall: print every available option (xml or text form) and stop
    if ($self->{'listall'}) {
        if ($self->{'xml'}) {
            &PrintUsage::print_xml_usage($opt_listall_options);
        }
        else
        {
            &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
        }
        die "\n";
    }

    # -xml: emit the option spec as XML; no collection name is required in
    # this mode, so return the (partially initialised) object immediately
    if ($self->{'xml'}) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also

    if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Redirect output: if -out names a file (rather than STDERR/STDOUT) it
    # is opened on the package-level bareword handle OUT, and $out becomes
    # the *string* 'inexport::OUT'.  Method calls on that string (autoflush
    # below) resolve to the handle because FileHandle is loaded and the file
    # runs under "no strict 'refs'".  'close_out' tells generate_statistics
    # to close OUT when the run finishes.
    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        open (OUT, ">$out") ||
            (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
        $out = 'inexport::OUT';
        $self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # Only pull in the MPI-based parallel helper when -jobs asks for it,
    # so single-job runs don't need ParallelInexport installed.
    if ((defined $self->{'jobs'}) && ($self->{'jobs'}>1)) {
        require ParallelInexport;
    }

    return bless $self, $class;
}
127
# Simplified constructor for use from CGI scripts: no command-line
# parsing, output fixed to STDERR, fail log left empty.
sub newCGI
{
    my $class = shift (@_);
    my ($mode, $collect, $gsdl_cgi, $opt_site) = @_;

    # 'out' holds the string "STDERR" (the same value the bareword
    # produced under "no strict 'subs'"); gsprintf and autoflush both
    # accept it by name.
    my $self = bless {
        'xml'        => 0,
        'mode'       => $mode,
        'out'        => 'STDERR',
        'faillog'    => '',
        'collection' => $collect,
    }, $class;

    if (defined $gsdl_cgi) {
        # Running inside the Greenstone CGI environment: ask it where the
        # collect directory for this site lives.
        $self->{'site'} = $opt_site;
        $self->{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }
    else {
        # Stand-alone fallback: default GS2 collect directory.
        $self->{'site'} = "";
        $self->{'collectdir'} = &util::filename_cat($ENV{'GSDLHOME'}, "collect");
    }

    return $self;
}
# Accessor: name of the collection this inexport object operates on.
sub get_collection
{
    my ($self) = @_;
    return $self->{'collection'};
}
159
160
sub read_collection_cfg
{
    # Resolve $collection to a real collection, hook the collection's own
    # perllib into @INC, open the fail log, and read the collection
    # configuration file.
    #
    # Arguments: $collection (name), $options (arg spec for usage message).
    # Returns:   ($config_filename, $collectcfg) where $collectcfg is the
    #            parsed configuration hash.
    # Side effects: opens the package-level FAILLOG handle (closed later in
    # generate_statistics) and stores its name in $self->{'faillog'}.
    # NOTE(review): $ENV{'GSDLCOLLECTDIR'} is read below — presumably set by
    # colcfg::use_collection; confirm against colcfg.pm.
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    # use_collection returns "" when the collection cannot be used
    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    # NOTE(review): the path is interpolated into the match unescaped, so
    # regex metacharacters in the path could misfire — \Q...\E would be safer.
    my $collection_perllib_path = $ENV{'GSDLCOLLECTDIR'} . '/perllib';
    my $inc_paths = join(':', @INC);
    if ($inc_paths !~ /$collection_perllib_path/)
    {
        unshift (@INC, $collection_perllib_path); # [jmt12]
    }

    # check that we can open the faillog; default to etc/fail.log inside
    # the collection when none was given on the command line
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # FAILLOG is a deliberate package-level bareword handle: plugins write
    # to it by name, and generate_statistics closes it at the end of the run
    open (FAILLOG, ">$faillog") ||
        (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);


    # keep the file *name* for reporting, then swap $faillog over to the
    # fully-qualified handle name (autoflush works on the string because
    # FileHandle is loaded and "no strict 'refs'" is in effect)
    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;

    # Read in the collection configuration file.
    my ($config_filename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
205
sub set_collection_options
{
    # Merge command-line options (already on $self from the constructor)
    # with settings from the collection configuration, filling in defaults
    # for anything still unset.  Command-line values win over collect.cfg
    # values throughout.  Establishes: infodbtype, importdir, archivedir,
    # verbosity, manifest, gzip, maxdocs, groupsize, OIDtype/OIDmetadata,
    # sortmeta, removeprefix/removesuffix, debug/gli flags, and the
    # removeold/keepold/incremental settings.
    #
    # Argument: $collectcfg — parsed collection configuration hash
    #           (mutated here: infodbtype may be defaulted/rewritten).
    # Dies if the import directory does not exist.
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};
    my $importdir = $self->{'importdir'};
    # export.pl callers supply 'exportdir' rather than 'archivedir'
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'} || "";
    my $out = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
        $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
        # we can't use the text version for archives dbs.
        $collectcfg->{'infodbtype'} = "gdbm";
    }

    # collect.cfg only supplies import/archive dirs when none were given
    # on the command line
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }

    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    if (!-e $importdir) {
        &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
        die "\n";
    }
    $self->{'importdir'} = $importdir;

    # default output directory depends on whether we are importing or
    # exporting
    if ($archivedir eq "") {
        if ($inexport_mode eq "import") {
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
        }
        elsif ($inexport_mode eq "export") {
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
        }
        else {
            print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
            print STDERR " Defaulting to 'archives' for file output\n";
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
        }
    }

    # normalise slashes and strip any trailing separator
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $self->{'archivedir'} = $archivedir;

    # verbosity: command line, else collect.cfg, else 2
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    $self->{'verbosity'} = $verbosity;

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
        $self->{'manifest'} = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $self->{'gzip'} = 1;
        }
    }

    # maxdocs: command line, else collect.cfg, else -1 (unlimited)
    if ($self->{'maxdocs'} !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
        } else {
            $self->{'maxdocs'} = -1; # the default
        }
    }

    # groupsize 1 is the command-line default, so only then does
    # collect.cfg get a say
    if ((defined $self->{'groupsize'}) && ($self->{'groupsize'} == 1)) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $self->{'groupsize'} = $collectcfg->{'groupsize'};
        }
    }

    # OIDtype must be one of the four recognised schemes; fall back to
    # collect.cfg, then to "hash"
    if (!defined $self->{'OIDtype'}
        || ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) {
        if (defined $collectcfg->{'OIDtype'}
            && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
            $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
        } else {
            $self->{'OIDtype'} = "hash"; # the default
        }
    }

    if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
        if (defined $collectcfg->{'OIDmetadata'}) {
            $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
        } else {
            $self->{'OIDmetadata'} = "dc.Identifier"; # the default
        }
    }

    my $sortmeta = $self->{'sortmeta'};
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $self->{'groupsize'} > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }
    $self->{'sortmeta'} = $sortmeta;

    if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") {
        $self->{'removeprefix'} = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") {
        $self->{'removesuffix'} = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
                                                   $self->{'incremental'}, $checkdir,
                                                   $collectcfg);

    $self->{'removeold'} = $removeold;
    $self->{'keepold'} = $keepold;
    $self->{'incremental'} = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;

    # Since this wasted my morning, let's at least warn a user that manifest
    # files now *only* work if keepold is set. [jmt12]
    # NOTE(review): the message says "Ignoring" but the manifest setting is
    # not actually cleared here — process_files will still honour it.
    if ($self->{'manifest'} && !$self->{'keepold'})
    {
        print STDERR "Warning: -manifest flag should not be specified without also setting -keepold or -incremental. Ignoring.\n";
    }
}
361
sub process_files
{
    # The main import/export pipeline.  Loads plugins and a plugout,
    # clears or primes the archives directory, works out which source
    # files are new/changed/deleted (from a manifest file or by diffing
    # against the previous import), runs the plugins over the import
    # directory (optionally farmed out over MPI jobs), and finally writes
    # the archive information databases back out.
    #
    # Arguments: $config_filename (collection config path),
    #            $collectcfg (parsed configuration hash).
    # Returns:   $pluginfo — the loaded plugin list, for generate_statistics.
    my $self = shift @_;
    my ($config_filename,$collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    # unpack the object state into locals for the (long) body below
    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};

    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};

    my $incremental = $self->{'incremental'};
    my $incremental_mode = $self->{'incremental_mode'};

    my $removeold = $self->{'removeold'};
    my $keepold = $self->{'keepold'};

    my $saveas = $self->{'saveas'};
    my $OIDtype = $self->{'OIDtype'};
    my $OIDmetadata = $self->{'OIDmetadata'};

    my $out = $self->{'out'};
    my $faillog = $self->{'faillog'};

    my $maxdocs = $self->{'maxdocs'};
    my $gzip = $self->{'gzip'};
    my $groupsize = $self->{'groupsize'};
    my $sortmeta = $self->{'sortmeta'};

    my $removeprefix = $self->{'removeprefix'};
    my $removesuffix = $self->{'removesuffix'};

    my $gli = $self->{'gli'};

    my $jobs = $self->{'jobs'};
    my $epoch = $self->{'epoch'};

    # related to export
    my $xsltfile = $self->{'xsltfile'};
    my $group_marc = $self->{'group_marc'};
    my $mapping_file = $self->{'mapping_file'};
    my $xslt_mets = $self->{'xslt_mets'};
    my $xslt_txt = $self->{'xslt_txt'};
    my $fedora_namespace = $self->{'fedora_namespace'};
    my $metadata_prefix = $self->{'metadata_prefix'};

    # progress markers for the GLI front end
    if ($inexport_mode eq "import") {
        print STDERR "<Import>\n" if $gli;
    }
    else {
        print STDERR "<export>\n" if $gli;
    }

    # If a manifest file was given, normalise its path (relative paths are
    # taken from the collection directory) and parse it into the lookup
    # object's 'delete'/'reindex'/'index' lists.
    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
    if ($self->{'manifest'} ne "") {
        my $manifest_filename = $self->{'manifest'};

        if (!&util::filename_is_absolute($manifest_filename)) {
            $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
        }

        # normalise slashes in the stored manifest path
        $self->{'manifest'} =~ s/[\\\/]+/\//g;
        $self->{'manifest'} =~ s/\/$//;

        $manifest_lookup->parse($manifest_filename);
    }

    my $manifest = $self->{'manifest'};

    # load all the plugins
    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    my $plugin_incr_mode = $incremental_mode;
    if ($manifest ne "") {
        # if we have a manifest file, then we pretend we are fully incremental for plugins
        $plugin_incr_mode = "all";
    }
    #some global options for the plugins
    my @global_opts = ();

    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory (and tmp
    # directory) if needed

    if ($removeold) {
        if (-e $archivedir) {
            &gsprintf($out, "{import.removing_archives}\n");
            &util::rm_r ($archivedir);
        }
        my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
        $tmpdir =~ s/[\\\/]+/\//g;
        $tmpdir =~ s/\/$//;
        if (-e $tmpdir) {
            &gsprintf($out, "{import.removing_tmpdir}\n");
            &util::rm_r ($tmpdir);
        }
    }

    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file

    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));

    # Warning! Black magic follows. When the following functions are called on
    # the GDBMServer class they will actually prompt the running of the Server
    # and attach themselves as a listener (even though they don't do anything)
    # This is done so that, in parallel importing, the server will persist
    # until the top level import.pl (which will be the first thing that calls
    # this function) completes.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, 1);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, 1);

    my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
    $archive_info->load_info ($arcinfo_doc_filename);

    if ($manifest eq "") {
        # Load in list of files in import folder from last import (if present)
        $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    }

    ####Use Plugout####
    # Decide which plugout (output document format writer) to use.  When a
    # plugout isn't named in collect.cfg, one is derived from -saveas; the
    # recognised names differ between import and export mode.
    my $plugout;

    if ($inexport_mode eq "import") {
        if (defined $collectcfg->{'plugout'}) {
            # If a plugout was specified in the collect.cfg file, assume it is sensible
            # We can't check the name because it could be anything, if it is a custom plugout
            $plugout = $collectcfg->{'plugout'};
        }
        else{
            if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
                push @$plugout,"GreenstoneXMLPlugout";
            }
            else{
                push @$plugout,$saveas."Plugout";
            }
        }
    }
    else {
        if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
            $plugout = $collectcfg->{'plugout'};
        }
        else{
            if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
                push @$plugout,"GreenstoneMETSPlugout";
            }
            else{
                push @$plugout,$saveas."Plugout";
            }
        }
    }

    # $plugout is a list: [0] is the plugout name, followed by its options
    my $plugout_name = $plugout->[0];

    push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
    push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
    push @$plugout,("-debug") if ($debug);
    push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
    push @$plugout,("-gzip_output") if ($gzip);
    push @$plugout,("-output_handle",$out) if (defined $out);

    push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");

    # plugout-specific options
    if ($plugout_name =~ m/^MARCXMLPlugout$/) {
        push @$plugout,("-group") if ($group_marc);
        push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "");
    }
    if ($plugout_name =~ m/^.*METSPlugout$/) {
        push @$plugout,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $xslt_mets ne "");
        push @$plugout,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $xslt_txt ne "");
    }

    if ($plugout_name eq "FedoraMETSPlugout") {
        push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "");
    }

    if ($plugout_name eq "DSpacePlugout") {
        push @$plugout,("-metadata_prefix",$metadata_prefix) if (defined $metadata_prefix && $metadata_prefix ne "");
    }

    # instantiate the plugout and point it at the archives directory
    my $processor = &plugout::load_plugout($plugout);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($removeold) {
        # occasionally, plugins may want to do something on remove
        # old, eg pharos image indexing
        &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
    }

    # process the import directory
    my $block_hash = {};
    $block_hash->{'new_files'} = {};
    $block_hash->{'reindex_files'} = {};
    # All of these are set somewhere else, so it's kinda nice to define them
    # here. [jmt12]
    $block_hash->{'all_files'} = {};
    $block_hash->{'deleted_files'} = {};
    $block_hash->{'file_blocks'} = {};
    $block_hash->{'metadata_files'} = {};
    $block_hash->{'shared_fileroot'} = '';
    # My new flag so we can tell we had a manifest way down in the plugins
    # [jmt12]
    $block_hash->{'manifest'} = 'false';
    my $metadata = {};

    # global blocking pass may set up some metadata
    # - when we have a manifest file we don't do this -unless- the collection
    # configuration indicates this collection contains complex (inherited)
    # metadata. [jmt12]
    if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true'))
    {
        &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
    }
    else
    {
        print "Skipping global file scan due to manifest and complexmeta configuration\n";
    }

    if ($manifest ne "") {
        # Manifest-driven run: the manifest dictates exactly which files to
        # delete, reindex and add; no directory diffing is done.

        $block_hash->{'manifest'} = 'true';

        #
        # 1. Process delete files first
        #
        my @deleted_files = keys %{$manifest_lookup->{'delete'}};
        my @full_deleted_files = ();

        # ensure all filenames are absolute; directories are expanded to
        # their contents
        foreach my $df (@deleted_files) {
            my $full_df =
                (&util::filename_is_absolute($df))
                ? $df
                : &util::filename_cat($importdir,$df);

            if (-d $full_df) {
                &add_dir_contents_to_list($full_df, \@full_deleted_files);
            } else {
                push(@full_deleted_files,$full_df);
            }
        }

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
        mark_docs_for_deletion($archive_info,{},
                               \@full_deleted_files,
                               $archivedir, $verbosity, "delete");


        #
        # 2. Now files for reindexing
        #

        my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
        my @full_reindex_files = ();
        # ensure all filenames are absolute
        foreach my $rf (@reindex_files) {
            my $full_rf =
                (&util::filename_is_absolute($rf))
                ? $rf
                : &util::filename_cat($importdir,$rf);

            if (-d $full_rf) {
                &add_dir_contents_to_list($full_rf, \@full_reindex_files);
            } else {
                push(@full_reindex_files,$full_rf);
            }
        }

        # reindexing = delete the old version, then re-add below
        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
        mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");

        # And now to ensure the new version of the file processed by
        # appropriate plugin, we need to add it to block_hash reindex list
        foreach my $full_rf (@full_reindex_files) {
            $block_hash->{'reindex_files'}->{$full_rf} = 1;
        }


        #
        # 3. Now finally any new files - add to block_hash new_files list
        #

        my @new_files = keys %{$manifest_lookup->{'index'}};
        my @full_new_files = ();

        foreach my $nf (@new_files) {
            # ensure filename is absolute
            my $full_nf =
                (&util::filename_is_absolute($nf))
                ? $nf
                : &util::filename_cat($importdir,$nf);

            if (-d $full_nf) {
                &add_dir_contents_to_list($full_nf, \@full_new_files);
            } else {
                push(@full_new_files,$full_nf);
            }
        }

        # NOTE(review): this "my" shadows the $arcinfo_src_filename declared
        # earlier in the sub (and drops the trailing "launch server" flag)
        my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
        # need to check this file exists before trying to read it. [jmt12]
        if (-e $arcinfo_src_filename)
        {
            # only queue files that aren't already in the src archive db
            my $arcinfodb_map = {};
            &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
            foreach my $f (@full_new_files) {
                # check that we haven't seen it already
                if (defined $arcinfodb_map->{$f}) {
                    # TODO make better warning
                    print STDERR "Warning: $f already in src archive, \n";
                } else {
                    $block_hash->{'new_files'}->{$f} = 1;
                }
            }
            undef $arcinfodb_map;
        }
        # no existing files - so we can just add all the ones that need adding.
        # [jmt12]
        else
        {
            foreach my $f (@full_new_files)
            {
                $block_hash->{'new_files'}->{$f} = 1;
            }
        }

        # If we are not using complex inherited metadata (and thus have skipped
        # the global file scan) we need to at least scan the directory of the
        # files being indexed/reindexed. [jmt12]
        if ($collectcfg->{'complexmeta'} ne 'true')
        {
            my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
            foreach my $file_to_import (@all_files_to_import)
            {
                # strip the filename to get the containing directory
                my $dir_to_import = $file_to_import;
                $dir_to_import =~ s/[^\\\/]*$//;
                # - one day we may need to manually scan this directory for child
                # directories and somehow explicitly block them from being
                # recursed.
                if (-d $dir_to_import)
                {
                    &plugin::file_block_read($pluginfo, $dir_to_import, '', $block_hash, $metadata, $gli);
                }
            }
        }
    }
    else {
        # if incremental, we read through the import folder to see whats changed.

        if ($incremental || $incremental_mode eq "onlyadd") {
            # restore doc.pm's OID counter from the previous run
            prime_doc_oid_count($archivedir);

            # Can now work out which files were new, already existed, and have
            # been deleted

            new_vs_old_import_diff($archive_info,$block_hash,$importdir,
                                   $archivedir,$verbosity,$incremental_mode);

            my @new_files = sort keys %{$block_hash->{'new_files'}};
            # NOTE(review): parens are misplaced — this is scalar(@new_files>0),
            # not scalar(@new_files)>0 — but the boolean result is the same
            if (scalar(@new_files>0)) {
                print STDERR "New files and modified metadata files since last import:\n  ";
                print STDERR join("\n  ",@new_files), "\n";
            }

            if ($incremental) {
                # only look for deletions if we are truely incremental
                my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
                # Filter out any in gsdl/tmp area
                my @filtered_deleted_files = ();
                my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
                my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
                $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
                $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);

                foreach my $df (@deleted_files) {
                    next if ($df =~ m/^$gsdl_tmp_area/);
                    next if ($df =~ m/^$collect_tmp_area/);

                    push(@filtered_deleted_files,$df);
                }


                @deleted_files = @filtered_deleted_files;

                if (scalar(@deleted_files)>0) {
                    print STDERR "Files deleted since last import:\n  ";
                    print STDERR join("\n  ",@deleted_files), "\n";


                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);

                    mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
                }

                my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};

                if (scalar(@reindex_files)>0) {
                    print STDERR "Files to reindex since last import:\n  ";
                    print STDERR join("\n  ",@reindex_files), "\n";
                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
                    mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
                }

            }
        }
    }

    # Check for existence of the file that's to contain earliestDateStamp in archivesdir
    # Do nothing if the file already exists (file exists on incremental build).
    # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
    # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
    # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
    # oailastmodified and oailastmodifieddate
    my $earliestDatestampFile = &util::filename_cat($archivedir, "earliestDatestamp");
    if (!-f $earliestDatestampFile && -d $archivedir) {
        my $current_time_in_seconds = time; # in seconds

        if(open(FOUT, ">$earliestDatestampFile")) {
            # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
            print FOUT $current_time_in_seconds;
            close(FOUT);
        }
        else {
            &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
        }

    }

    # now, whichever mode we are in, we can process the entire import folder
    if ((defined $jobs) && ($jobs > 1))
    {
        # if jobs are set to >1, run in parallel using MPI helper
        # [hs, 1 july 2010]
        &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,
                                              $self->{'collection'}, $self->{'site'});
    }
    else
    {
        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }


    if ($saveas eq "FedoraMETS") {
        # create collection "doc obj" for Fedora that contains
        # collection-level metadata

        my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
        $doc_obj->set_OID("collection");

        my $col_name = undef;
        my $col_meta = $collectcfg->{'collectionmeta'};

        if (defined $col_meta) {
            store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
            store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
        }
        $processor->process($doc_obj);
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # Store the value of OIDCount (used in doc.pm) so it can be
    # restored correctly to this value on an incremental build
    store_doc_oid_count($archivedir);

    # write out the archive information file
    $processor->close_file_output() if (defined $groupsize) && ($groupsize > 1);
    $processor->close_group_output() if $processor->is_group();

    # for backwards compatability with archvies.inf file
    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
        $archive_info->save_info($arcinfo_doc_filename);
    }
    else {
        $archive_info->save_revinfo_db($arcinfo_src_filename);
    }

    return $pluginfo;
}
861
862
sub generate_statistics
{
    # Write the end-of-run statistics (via plugin::write_stats) and close
    # the package-level output handles opened earlier (OUT in new(),
    # FAILLOG in read_collection_cfg()).
    #
    # Argument: $pluginfo — the plugin list returned by process_files.
    my $self = shift @_;
    my ($pluginfo) = @_;

    my $inexport_mode = $self->{'mode'};

    my $statsfile = $self->{'statsfile'};
    my $out = $self->{'out'};
    my $faillogname = $self->{'faillogname'};
    my $gli = $self->{'gli'};
    my $jobs = $self->{'jobs'};

    # write out import stats

    if ((!defined $jobs) || ($jobs == 1))
    {
        # statistics are only written for a single-job run; with multiple
        # parallel jobs they are skipped here
        # [hs, 1 july 2010]

        # -statsfile may name a file; fall back to $out's stream on failure
        my $close_stats = 0;
        if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
            if (open (STATS, ">$statsfile")) {
                $statsfile = 'inexport::STATS';
                $close_stats = 1;
            } else {
                &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
                &gsprintf($out, "{import.stats_backup}\n");
                $statsfile = 'STDERR';
            }
        }

        &gsprintf($out, "\n");
        &gsprintf($out, "*********************************************\n");
        # resource key is "import.complete" or "export.complete"
        &gsprintf($out, "{$inexport_mode.complete}\n");
        &gsprintf($out, "*********************************************\n");

        &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
        if ($close_stats) {
            close STATS;
        }
    }

    # close the redirected output file (if new() opened one) and the fail log
    close OUT if $self->{'close_out'};
    close FAILLOG;
}
909
910
# Copy one collection-level metadata field (e.g. "collectionname") from
# the collectionmeta hash onto the given document object as extracted
# ("ex.") metadata, preserving any per-language qualifiers.  The English
# or unqualified collection name is additionally mirrored into dc.Title.
sub store_collectionmeta
{
    my ($collectionmeta, $field, $doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $values_by_lang = $collectionmeta->{$field};

    while (my ($lang_key, $value) = each %$values_by_lang) {

        ### print STDERR "*** $lang_key = $value\n";

        my $md_label = "ex.$field";

        # keys of the form "[l=xx]" carry a language code which becomes
        # a "^xx" suffix on the metadata label
        if ($lang_key =~ m/^\[l=(.*?)\]$/) {
            $md_label .= "^$1";
        }

        $doc_obj->add_utf8_metadata($top_section, $md_label, $value);

        # see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
        # while "collectionname" in GS2 is called "name" in GS3.
        # Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
        if ($md_label eq "ex.collectionname^en" || $md_label eq "ex.collectionname") {
            $doc_obj->add_utf8_metadata($top_section, "dc.Title", $value);
        }
    }
}
948
949
# Path of the "OIDcount" file in the archives directory, which persists
# doc.pm's OID counter between builds.
sub oid_count_file {
    my ($archivedir) = @_;
    my $counter_path = &util::filename_cat ($archivedir, "OIDcount");
    return $counter_path;
}
954
955
sub prime_doc_oid_count
{
    # Restore doc.pm's OID counter from the "OIDcount" file in the archives
    # directory (written by store_doc_oid_count at the end of a previous
    # run), so an incremental build continues numbering where the last one
    # stopped.  Silently a no-op on a first/full build when the file does
    # not exist yet.
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    if (-e $oid_count_filename) {
        # three-argument open with a lexical filehandle (was a two-argument
        # open on a bareword handle)
        if (open(my $oid_in, '<', $oid_count_filename)) {
            my $OIDcount = <$oid_in>;
            close($oid_in);

            # guard against an empty/truncated file: don't clobber the
            # counter with undef
            if (defined $OIDcount) {
                chomp $OIDcount;
                $doc::OIDcount = $OIDcount;
            }
        }
        else {
            &gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
        }
    }

}
975
# Use the file "OIDcount" in the archives directory to record
# what value doc.pm got up to, so that a later (incremental) import can
# resume OID allocation from this point (see prime_doc_oid_count).
#
# Takes the archives directory; prints a warning (via gsprintf) if the
# counter file cannot be written.
sub store_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # three-arg open with a lexical handle, rather than the old
    # two-arg/bareword form
    if (open(my $oid_out, ">", $oid_count_filename)) {
	print $oid_out $doc::OIDcount, "\n";

	# check close on a write handle -- buffered write errors only
	# surface here
	close($oid_out)
	    or &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
    else {
	&gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
994
995
996
# Compare the current import-directory contents against the file list
# recorded from the previous import, moving every entry of
# $block_hash->{'all_files'} into one of:
#   'new_files'                      -- not present in the previous import
#   'existing_files'                 -- present before and unchanged
#   'reindex_files'                  -- present before but needs reprocessing
#   'new_or_modified_metadata_files' -- metadata.xml-style files that are new
#                                       or newer than the archive info database
#   'deleted_files'                  -- in the previous import but now gone
#
# Parameters: arcinfo object, block hash, import dir, archives dir,
# verbosity level, incremental mode ("all" or "onlyadd").
# Works by side effect on $block_hash; no meaningful return value.
sub new_vs_old_import_diff
{
    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    # -M is "script start time minus file modification time" in days, so a
    # file with a *smaller* -M than this is newer than the database
    my $archiveinf_timestamp = -M $arcinfo_doc_filename;

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default

    my $prev_all_files = $archive_info->{'prev_import_filelist'};
    my $full_prev_all_files = {};

    foreach my $prev_file (keys %$prev_all_files) {

	if (!&util::filename_is_absolute($prev_file)) {
	    my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
	    $full_prev_all_files->{$full_prev_file} = $prev_file;
	}
	else {
	    $full_prev_all_files->{$prev_file} = $prev_file;
	}
    }


    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

	my $full_curr_file = $curr_file;

	# entry in 'all_files' is moved to either 'existing_files',
	# 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

	if (!&util::filename_is_absolute($curr_file)) {
	    # add in import dir to make absolute
	    $full_curr_file = &util::filename_cat($importdir,$curr_file);
	}

	# figure out if new file or not
	if (defined $full_prev_all_files->{$full_curr_file}) {
	    # delete it so that only files that need deleting are left
	    delete $full_prev_all_files->{$full_curr_file};

	    # had it before. is it a metadata file?
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {

		# is it modified??
		if (-M $full_curr_file < $archiveinf_timestamp) {
		    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
		    # its newer than last build
		    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
		}
	    }
	    else {
		if ($incremental_mode eq "all") {

		    # had it before
		    $block_hash->{'existing_files'}->{$full_curr_file} = 1;

		}
		else {
		    # Warning in "onlyadd" mode, but had it before!
		    print STDERR "Warning: File $full_curr_file previously imported.\n";
		    print STDERR "         Treating as new file\n";

		    $block_hash->{'new_files'}->{$full_curr_file} = 1;

		}
	    }
	}
	else {
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# the new file is the special sort of file greenstone uses
		# to attach metadata to src documents
		# i.e metadata.xml
		# (but note, the filename used is not constrained in
		# Greenstone to always be this)

		print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
		$block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
	    }
	    else {
		$block_hash->{'new_files'}->{$full_curr_file} = 1;
	    }
	}


	delete $block_hash->{'all_files'}->{$curr_file};
    }




    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
	my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

	$situated_dir =~ s/[\\\/]+$//; # remove tailing slashes

	# Escape *all* regex metacharacters for the prefix match below.
	# The old code only doubled backslashes (s/\\/\\\\/g), so any
	# directory containing '.', '(', '+', '[' etc. could match the
	# wrong files; quotemeta makes the path match literally.
	my $situated_dir_re = quotemeta($situated_dir);

	# Go through existing_files, and mark anything that is contained
	# within 'situated_dir' to be reindexed (in case some of the metadata
	# attaches to one of these files)

	my $reindex_files = [];

	foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

	    if ($existing_f =~ m/^$situated_dir_re/) {

		print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

		push(@$reindex_files,$existing_f);
		$block_hash->{'reindex_files'}->{$existing_f} = 1;
		delete $block_hash->{'existing_files'}->{$existing_f};

	    }
	}

	# metadata file needs to be in new_files list so parsed by MetadataXMLPlug
	# (or equivalent)
	$block_hash->{'new_files'}->{$new_mdf} = 1;

    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
	if (-M $existing_filename < $archiveinf_timestamp) {
	    # file is newer than last build

	    my $existing_file = $existing_filename;
	    #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});

	    #my $collectdir_resafe = &util::filename_to_regex($collectdir);
	    #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;

	    print STDERR "**** Reindexing existing file: $existing_file\n";

	    push(@$reindex_files,$existing_file);
	    $block_hash->{'reindex_files'}->{$existing_filename} = 1;
	}

    }


    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # supress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    my @deleted_files = values %$full_prev_all_files;

    # foreach rather than map-in-void-context: this loop is executed
    # purely for its side effects on $block_hash
    foreach my $curr_file (@deleted_files) {
	my $full_curr_file = $curr_file;

	if (!&util::filename_is_absolute($curr_file)) {
	    # add in import dir to make absolute

	    $full_curr_file = &util::filename_cat($collectdir,$curr_file);
	}


	if (!-e $full_curr_file) {
	    $block_hash->{'deleted_files'}->{$curr_file} = 1;
	}
    }



}
1194
1195
# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
# $mode is 'delete' or 'reindex'
#
# For each file in @$deleted_files: its record is removed from the
# archiveinf-src database, every OID derived from it is marked with
# index-status "D" in the archiveinf-doc database, and (where the file
# was only an associated/metadata file) the primary source document is
# flagged for reindexing via $block_hash->{'reindex_files'}.
# Works by side effect on the info databases, $archive_info and
# $block_hash; no return value.
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    # text used only in the verbose progress message below
    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
	$mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
	# use 'archiveinf-src' info database file to look up all the OIDs
	# that this file is used in (note in most cases, it's just one OID)

	my $src_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
	my $oids = $src_rec->{'oid'};
	my $file_record_deleted = 0;	# NOTE(review): never read -- left for interface stability

	# delete the src record
	# (handle is opened in "append" mode and closed again per file so
	# each delete is flushed before the doc records are rewritten)
	my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
	&dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
	&dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);


	foreach my $oid (@$oids) {

	    # find the source doc (the primary file that becomes this oid)
	    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
	    my $doc_source_file = $doc_rec->{'src-file'}->[0];
	    if (!&util::filename_is_absolute($doc_source_file)) {
		$doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
	    }

	    if ($doc_source_file ne $file) {
		# its an associated or metadata file

		# mark source doc for reimport as one of its assoc files has changed or deleted
		$block_hash->{'reindex_files'}->{$doc_source_file} = 1;

	    }
	    my $curr_status = $archive_info->get_status_info($oid);
	    # skip OIDs already marked "D" so the record isn't rewritten twice
	    if (defined($curr_status) && (($curr_status ne "D"))) {
		if ($verbosity>1) {
		    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
		}
		# mark oid for deletion (it will be deleted or reimported)
		$archive_info->set_status_info($oid,"D");
		# rewrite the raw db record with its <index-status> line
		# forced to "D", then store it back
		my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
		$val =~ s/^<index-status>(.*)$/<index-status>D/m;

		my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
		my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

		&dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
		&dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
	    }
	}

    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.

    if ($mode eq "delete") {
	foreach my $file (@$deleted_files) {
	    if (defined $block_hash->{'reindex_files'}->{$file}) {
		delete $block_hash->{'reindex_files'}->{$file};
	    }
	}
    }


}
1280
# Recursively collect every plain file beneath $dirname, pushing full
# paths onto the array referenced by $list.  The "." / ".." entries and
# ".svn" book-keeping folders are skipped.
#
# Returns -1 if the directory could not be read (after printing a
# warning), 0 otherwise.
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # find all the files in the directory, using a lexical directory
    # handle rather than the old bareword DIR
    my $dir_handle;
    if (!opendir($dir_handle, $dirname)) {
	print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    my @dir_entries = readdir($dir_handle);
    closedir($dir_handle);

    foreach my $subfile (@dir_entries) {
	next if ($subfile =~ m/^\.\.?$/);   # skip "." and ".."
	next if ($subfile =~ /^\.svn$/);    # skip Subversion metadata dirs
	my $full_file = &util::filename_cat($dirname, $subfile);
	if (-d $full_file) {
	    # recurse into sub-directory
	    &add_dir_contents_to_list($full_file, $list);
	} else {
	    push (@$list, $full_file);
	}
    }

    return 0; # success
}
1309
1310
13111;
Note: See TracBrowser for help on using the repository browser.