root/gs2-extensions/parallel-building/trunk/src/perllib/inexport.pm @ 26932

Revision 26932, 42.9 KB (checked in by jmt12, 7 years ago)

Altered all calls to built-in perl file tests to instead use util library ones. This allows better awareness of HDFS or other strange file paths. Added support for newer version of manifest (where files are followed verbatim). Only write OIDcount for numerical OID collections

Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use plugin;
38use plugout;
39use manifest;
40use inexport;
41use util;
42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
47
48use File::Basename;
49
# Constructor used by import.pl / export.pl.
#
# Parameters:
#   $mode                - "import" or "export"; stored for later dispatch
#   $argv                - ref to the remaining command-line arguments; after
#                          parsing, exactly one element (the collection name)
#                          must remain
#   $options             - option specification hash (its {'args'} entry is
#                          handed to parse2::parse); also used for usage output
#   $opt_listall_options - option spec printed when -listall was given
#
# Returns a blessed inexport object.  Dies (after printing usage) on a parse
# error, on -h, or when no collection name is left over; also "dies" by
# design after -xml/-listall usage output paths that print and return/die.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
	&gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($self->{'listall'}) {
	if ($self->{'xml'}) {
	    &PrintUsage::print_xml_usage($opt_listall_options);
	}
	else
	{
	    &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
	}
	die "\n";
    }

    if ($self->{'xml'}) {
        &PrintUsage::print_xml_usage($options);
	print "\n";
	return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
	&gsprintf::output_strings_in_UTF8;
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also
    if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
    {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }

    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
	# Three-argument open so a -out value beginning with '>' or '|'
	# cannot change the open mode (mode injection).  The bareword
	# handle is deliberate: the string 'inexport::OUT' stored below is
	# used as the handle name by downstream code.
	open (OUT, '>', $out) ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
	$out = 'inexport::OUT';
	$self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # Parallel processing support is only loaded when more than one job
    # was requested on the command line
    if ((defined $self->{'jobs'}) && ($self->{'jobs'}>1)) {
	require ParallelInexport;
    }

    return bless $self, $class;
}
127
# Simplified version of the constructor for use with CGI scripts.
# Skips all command-line parsing: output goes straight to STDERR and the
# collection name is passed in directly.
sub newCGI
{
    my $class = shift (@_);
    my ($mode, $collect, $gsdl_cgi, $opt_site) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # CGI runs report straight to the standard error stream
    $self->{'out'} = STDERR;

    if (defined $gsdl_cgi) {
        # Running under the CGI wrapper, which knows where the site's
        # collect directory lives
        $self->{'site'} = $opt_site;
        $self->{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }
    else {
        # Stand-alone fallback: use the main Greenstone collect directory
        $self->{'site'} = "";
        $self->{'collectdir'} = &util::filename_cat($ENV{'GSDLHOME'}, "collect");
    }

    # No fail log configured in CGI mode until read_collection_cfg sets one
    $self->{'faillog'} = "";
    $self->{'collection'} = $collect;

    return bless $self, $class;
}
# Accessor: the name of the collection this inexport object operates on.
sub get_collection
{
    my ($self) = @_;

    return $self->{'collection'};
}
159
160
# Resolve the collection, extend @INC with the collection's own perllib,
# open the fail log, then read the collection configuration file.
#
# Parameters:
#   $collection - collection name (resolved via colcfg::use_collection)
#   $options    - option spec, used only for usage output on failure
#
# Returns ($config_filename, $collectcfg): the config file path and the
# parsed configuration hash.  Dies (after printing usage) when the
# collection cannot be resolved.
# Side effects: sets $self->{'faillog'} / $self->{'faillogname'} and relies
# on $ENV{'GSDLCOLLECTDIR'} having been set by use_collection.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site       = $self->{'site'};
    my $out        = $self->{'out'};

    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    my $collection_perllib_path = $ENV{'GSDLCOLLECTDIR'} . '/perllib';
    my $inc_paths = join(':', @INC);
    # \Q...\E so regex metacharacters in the path (dots, or backslashes on
    # Windows) are matched literally instead of being treated as a pattern
    if ($inc_paths !~ /\Q$collection_perllib_path\E/)
    {
      unshift (@INC, $collection_perllib_path); # [jmt12]
    }

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
	$faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # three-argument open so a path beginning with '>' or '|' cannot
    # change the open mode; the bareword handle is kept because the
    # string 'inexport::FAILLOG' below is used as the handle name
    open (FAILLOG, '>', $faillog) ||
	(&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);

    # remember the real path, then switch $faillog to the handle-name
    # string that plugins print through
    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;

    # Read in the collection configuration file.
    my ($config_filename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
205
# Merge per-run options with the collection configuration and fill in
# defaults, storing the results back into $self.
#
# Precedence for each setting is: explicit command-line value, then the
# collect.cfg value ($collectcfg), then a hard-coded default.  The checks
# below are order-dependent (e.g. importdir must be resolved before the
# existence test) so the statement order must be preserved.
#
# Parameter: $collectcfg - parsed collection configuration hash.
# Returns nothing useful; mutates $self and $collectcfg->{'infodbtype'}.
# Dies when the resolved import directory does not exist.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity  = $self->{'verbosity'};
    my $debug      = $self->{'debug'};
    my $importdir  = $self->{'importdir'};
    # export.pl callers may have supplied 'exportdir' instead of 'archivedir'
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'} || "";
    my $out        = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
      $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
	# we can't use the text version for archives dbs.
	$collectcfg->{'infodbtype'} = "gdbm";
    }
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
	$importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
	$archivedir = $collectcfg->{'archivedir'};
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    if ($importdir eq "")
    {
      $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import");
    }
    else
    {
      # hijack filename_cat to sanitize the user provided importdir [jmt12]
      $importdir = &util::filename_cat ($importdir);
    }
    # util::dir_exists rather than -d so HDFS-style paths are handled
    if (!&util::dir_exists($importdir)) {
	&gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
	die "\n";
    }
    $self->{'importdir'} = $importdir;

    # pick the default output directory by mode: archives/ for import,
    # export/ for export, archives/ (with a warning) for anything else
    if ($archivedir eq "") {
	if ($inexport_mode eq "import") {
	    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
	}
	elsif ($inexport_mode eq "export") {
	    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
	}
	else {
	    print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
	    print STDERR "         Defaulting to 'archives' for file output\n";
	    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
	}
    }
    else
    {
      # use filename_cat() to sanitize the user provided archive directory as
      # it is more aware of protocols etc
      $archivedir = &util::filename_cat($archivedir);
    }
    $self->{'archivedir'} = $archivedir;

    # verbosity: any numeric command-line value wins, else collect.cfg, else 2
    if ($verbosity !~ /\d+/) {
	if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
	    $verbosity = $collectcfg->{'verbosity'};
	} else {
	    $verbosity = 2; # the default
	}
    }
    $self->{'verbosity'} = $verbosity;

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
	$self->{'manifest'} = $collectcfg->{'manifest'};
    }
    # Default value
    # (0 = classic manifest semantics; bumped later if the parsed manifest
    # declares a version)
    $self->{'manifest_version'} = 0;

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
	if ($collectcfg->{'gzip'} =~ /^true$/i) {
	    $self->{'gzip'} = 1;
	}
    }

    # maxdocs: -1 means "no limit"
    if ($self->{'maxdocs'} !~ /\-?\d+/) {
	if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
	    $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
	} else {
	    $self->{'maxdocs'} = -1; # the default
	}
    }

    # groupsize of 1 is presumably the command-line default, so only then
    # does the collect.cfg value take over -- TODO confirm against import.pl
    if ((defined $self->{'groupsize'}) && ($self->{'groupsize'} == 1)) {
	if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
	    $self->{'groupsize'} = $collectcfg->{'groupsize'};
	}
    }

    # OIDtype must be one of the four known schemes; fall back to "hash"
    if (!defined $self->{'OIDtype'}
	|| ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) {
	if (defined $collectcfg->{'OIDtype'}
	    && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
	    $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
	} else {
	    $self->{'OIDtype'} = "hash"; # the default
	}
    }

    if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
	if (defined $collectcfg->{'OIDmetadata'}) {
	    $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
	} else {
	    $self->{'OIDmetadata'} = "dc.Identifier"; # the default
	}
    }

    my $sortmeta = $self->{'sortmeta'};
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
	$sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $self->{'groupsize'} > 1) {
	&gsprintf($out, "{import.cannot_sort}\n\n");
	$sortmeta = undef;
    }
    $self->{'sortmeta'} = $sortmeta;

    if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") {
	$self->{'removeprefix'} = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") {
	$self->{'removesuffix'} = $collectcfg->{'removesuffix'};
    }
    # debug/gli are booleans in collect.cfg spelled "true"/"false"
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
	$self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
	$self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
	= &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
						   $self->{'incremental'}, $checkdir,
						   $collectcfg);

    $self->{'removeold'}        = $removeold;
    $self->{'keepold'}          = $keepold;
    $self->{'incremental'}      = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;

    # Since this wasted my morning, let's at least warn a user that manifest
    # files now *only* work if keepold is set. [jmt12]
    if ($self->{'manifest'} && !$self->{'keepold'})
    {
      print STDERR "Warning: -manifest flag should not be specified without also setting -keepold or -incremental. Ignoring.\n";
    }
}
372
373sub process_files
374{
375    my $self = shift @_;
376    my ($config_filename,$collectcfg) = @_;
377
378    my $inexport_mode = $self->{'mode'};
379
380    my $verbosity   = $self->{'verbosity'};
381    my $debug       = $self->{'debug'};
382
383    my $importdir   = $self->{'importdir'};
384    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
385
386    my $incremental = $self->{'incremental'};
387    my $incremental_mode = $self->{'incremental_mode'};
388
389    my $removeold   = $self->{'removeold'};
390    my $keepold     = $self->{'keepold'};
391
392    my $saveas      = $self->{'saveas'};
393    my $OIDtype     = $self->{'OIDtype'};
394    my $OIDmetadata = $self->{'OIDmetadata'};
395
396    my $out         = $self->{'out'};
397    my $faillog     = $self->{'faillog'};
398
399    my $maxdocs     = $self->{'maxdocs'};
400    my $gzip        = $self->{'gzip'};
401    my $groupsize   = $self->{'groupsize'};
402    my $sortmeta    = $self->{'sortmeta'};
403
404    my $removeprefix = $self->{'removeprefix'};
405    my $removesuffix = $self->{'removesuffix'};
406
407    my $gli          = $self->{'gli'};
408
409    my $jobs         = $self->{'jobs'};
410    my $epoch        = $self->{'epoch'};
411
412    # related to export
413    my $xsltfile         = $self->{'xsltfile'};
414    my $group_marc       = $self->{'group_marc'};
415    my $mapping_file     = $self->{'mapping_file'};
416    my $xslt_mets        = $self->{'xslt_mets'};
417    my $xslt_txt         = $self->{'xslt_txt'};
418    my $fedora_namespace = $self->{'fedora_namespace'};
419    my $metadata_prefix  = $self->{'metadata_prefix'};
420
421    if ($inexport_mode eq "import") {
422    print STDERR "<Import>\n" if $gli;
423    }
424    else {
425    print STDERR "<export>\n" if $gli;
426    }
427
428    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
429    if ($self->{'manifest'} ne "") {
430    my $manifest_filename = $self->{'manifest'};
431
432    if (!&util::filename_is_absolute($manifest_filename)) {
433        $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
434    }
435
436    $self->{'manifest'} =~ s/[\\\/]+/\//g;
437    $self->{'manifest'} =~ s/\/$//;
438
439    $manifest_lookup->parse($manifest_filename);
440
441        # Manifests may now include a version number
442        $self->{'manifest_version'} = $manifest_lookup->get_version();
443    }
444
445    my $manifest = $self->{'manifest'};
446
447    # load all the plugins
448    my $plugins = [];
449    if (defined $collectcfg->{'plugin'}) {
450    $plugins = $collectcfg->{'plugin'};
451    }
452
453    my $plugin_incr_mode = $incremental_mode;
454    if ($manifest ne "") {
455    # if we have a manifest file, then we pretend we are fully incremental for plugins
456    $plugin_incr_mode = "all";
457    }
458    #some global options for the plugins
459    my @global_opts = ();
460
461    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode);
462    if (scalar(@$pluginfo) == 0) {
463    &gsprintf($out, "{import.no_plugins_loaded}\n");
464    die "\n";
465    }
466
467    # remove the old contents of the archives directory (and tmp
468    # directory) if needed
469
470    if ($removeold) {
471    if (&util::dir_exists($archivedir)) {
472        &gsprintf($out, "{import.removing_archives}\n");
473        &util::rm_r ($archivedir);
474    }
475    my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
476    $tmpdir =~ s/[\\\/]+/\//g;
477    $tmpdir =~ s/\/$//;
478    if (&util::dir_exists($tmpdir)) {
479        &gsprintf($out, "{import.removing_tmpdir}\n");
480        &util::rm_r ($tmpdir);
481    }
482    }
483
484    # create the archives dir if needed
485    &util::mk_all_dir($archivedir);
486
487    # read the archive information file
488
489    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
490    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
491    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));
492
493    # Warning! Black magic follows. When the following functions are called on
494    # the GDBMServer class they will actually prompt the running of the Server
495    # and attach themselves as a listener (even though they don't do anything)
496    # This is done so that, in parallel importing, the server will persist
497    # until the top level import.pl (which will be the first that calls this
498    # function) completes. [jmt12]
499    my $create_server = 0;
500    # - infodb's of type *server need to be started on the same machine that
501    #   runs parallel-import.sh, especially when distributed processing
502    #   gets involved.
503    if ($collectcfg->{'infodbtype'} =~ /server$/)
504    {
505      $create_server = 1;
506    }
507    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, $create_server);
508    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, $create_server);
509
510    my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
511    $archive_info->load_info ($arcinfo_doc_filename);
512
513    if ($manifest eq "") {
514    # Load in list of files in import folder from last import (if present)
515    $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
516    }
517
518    ####Use Plugout####
519    my $plugout;
520
521    if ($inexport_mode eq "import") {
522    if (defined $collectcfg->{'plugout'}) {
523        # If a plugout was specified in the collect.cfg file, assume it is sensible
524        # We can't check the name because it could be anything, if it is a custom plugout
525        $plugout = $collectcfg->{'plugout'};
526    }
527    else{
528        if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
529        push @$plugout,"GreenstoneXMLPlugout";
530        }
531        else{
532        push @$plugout,$saveas."Plugout";
533        }
534    }
535    }
536    else {
537    if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
538        $plugout = $collectcfg->{'plugout'};
539    }
540    else{
541        if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
542        push @$plugout,"GreenstoneMETSPlugout";
543        }
544        else{
545        push @$plugout,$saveas."Plugout";
546        }
547    }
548    }
549   
550    my $plugout_name = $plugout->[0];
551
552    push @$plugout,("-output_info",$archive_info)  if (defined $archive_info);
553    push @$plugout,("-verbosity",$verbosity)       if (defined $verbosity);
554    push @$plugout,("-debug")                      if ($debug);
555    push @$plugout,("-group_size",$groupsize)      if (defined $groupsize);
556    push @$plugout,("-gzip_output")                if ($gzip);
557    push @$plugout,("-output_handle",$out)         if (defined $out);
558
559    push @$plugout,("-xslt_file",$xsltfile)        if (defined $xsltfile && $xsltfile ne "");
560
561    if ($plugout_name =~ m/^MARCXMLPlugout$/) {
562    push @$plugout,("-group")                      if ($group_marc);
563    push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "");
564    }
565    if ($plugout_name =~ m/^.*METSPlugout$/) {
566    push @$plugout,("-xslt_mets",$xslt_mets)       if (defined $xslt_mets && $xslt_mets ne "");
567    push @$plugout,("-xslt_txt",$xslt_txt)         if (defined $xslt_txt && $xslt_txt ne "");
568    }
569
570    if ($plugout_name eq "FedoraMETSPlugout") {
571    push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "");
572    }
573   
574    if ($plugout_name eq "DSpacePlugout") {
575    push @$plugout,("-metadata_prefix",$metadata_prefix) if (defined $metadata_prefix && $metadata_prefix ne "");   
576    }
577
578    my $processor = &plugout::load_plugout($plugout);
579    $processor->setoutputdir ($archivedir);
580    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
581    $processor->set_OIDtype ($OIDtype, $OIDmetadata);
582   
583    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
584   
585    if ($removeold) {
586        # occasionally, plugins may want to do something on remove
587        # old, eg pharos image indexing
588    &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
589    }
590
591    # process the import directory
592    my $block_hash = {};
593    $block_hash->{'new_files'} = {};
594    $block_hash->{'reindex_files'} = {};
595    # All of these are set somewhere else, so it's kinda nice to define them
596    # here. [jmt12]
597    $block_hash->{'all_files'} = {};
598    $block_hash->{'deleted_files'} = {};
599    $block_hash->{'file_blocks'} = {};
600    $block_hash->{'metadata_files'} = {};
601    $block_hash->{'shared_fileroot'} = '';
602    # My new flag so we can tell we had a manifest way down in the plugins
603    # [jmt12]
604    $block_hash->{'manifest'} = 'false';
605    my $metadata = {};
606   
607    # global blocking pass may set up some metadata
608    # - when we have a manifest file we don't do this -unless- the collection
609    #   configuration indicates this collection contains complex (inherited)
610    #   metadata. [jmt12]
611    if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true'))
612    {
613      #print "Temporarily skipping global file scan as it is slow as molasses\n";
614      &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
615    }
616    else
617    {
618      print "Skipping global file scan due to manifest and complexmeta configuration\n";
619    }
620   
621    if ($manifest ne "") {
622
623      $block_hash->{'manifest'} = 'true';
624
625    #
626    # 1. Process delete files first
627    #
628    my @deleted_files = keys %{$manifest_lookup->{'delete'}};
629    my @full_deleted_files = ();
630
631    # ensure all filenames are absolute
632    foreach my $df (@deleted_files) {
633        my $full_df =
634        (&util::filename_is_absolute($df))
635        ? $df
636        : &util::filename_cat($importdir,$df);
637
638        if (-d $full_df) {
639        &add_dir_contents_to_list($full_df, \@full_deleted_files);
640        } else {
641        push(@full_deleted_files,$full_df);
642        }
643    }
644   
645    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
646    mark_docs_for_deletion($archive_info,{},
647                   \@full_deleted_files,
648                   $archivedir, $verbosity, "delete");
649
650
651    #
652    # 2. Now files for reindexing
653    #
654
655    my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
656    my @full_reindex_files = ();
657    # ensure all filenames are absolute
658    foreach my $rf (@reindex_files) {       
659        my $full_rf =
660        (&util::filename_is_absolute($rf))
661        ? $rf
662        : &util::filename_cat($importdir,$rf);
663
664        if (-d $full_rf) {
665        &add_dir_contents_to_list($full_rf, \@full_reindex_files);
666        } else {
667        push(@full_reindex_files,$full_rf);
668        }
669    }
670   
671    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
672    mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
673
674    # And now to ensure the new version of the file processed by
675    # appropriate plugin, we need to add it to block_hash reindex list
676    foreach my $full_rf (@full_reindex_files) {
677        $block_hash->{'reindex_files'}->{$full_rf} = 1;
678    }
679
680
681    #
682    # 3. Now finally any new files - add to block_hash new_files list
683    #
684
685    my @new_files = keys %{$manifest_lookup->{'index'}};
686    my @full_new_files = ();
687
688    foreach my $nf (@new_files) {
689        # ensure filename is absolute
690        my $full_nf =
691        (&util::filename_is_absolute($nf))
692        ? $nf
693        : &util::filename_cat($importdir,$nf);
694
695        if (-d $full_nf) {
696        &add_dir_contents_to_list($full_nf, \@full_new_files);
697        } else {
698        push(@full_new_files,$full_nf);
699        }
700    }
701
702    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
703      # need to check this file exists before trying to read it. [jmt12]
704      if (-e $arcinfo_src_filename)
705      {
706    my $arcinfodb_map = {};
707    &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
708    foreach my $f (@full_new_files) {
709        # check that we haven't seen it already
710        if (defined $arcinfodb_map->{$f}) {
711        # TODO make better warning
712        print STDERR "Warning: $f already in src archive, \n";
713        } else {
714        $block_hash->{'new_files'}->{$f} = 1;
715        }
716    }
717    undef $arcinfodb_map;
718      }
719      # no existing files - so we can just add all the ones that need adding.
720      # [jmt12]
721      else
722      {
723        foreach my $f (@full_new_files)
724        {
725          $block_hash->{'new_files'}->{$f} = 1;
726        }
727      }
728
729      # If we are not using complex inherited metadata (and thus have skipped
730      # the global file scan) we need to at least check for a matching
731      # metadata.xml for the files being indexed/reindexed. [jmt12]
732      # - unless we are using the newer version of Manifests, which are treated
733      #   verbatim, and should have a metadata element for metadata files (so
734      #   we can explicitly process metadata files other than metadata.xml)
735      if ($self->{'manifest_version'} < 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true'))
736      {
737        my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
738        foreach my $file_to_import (@all_files_to_import)
739        {
740          my $metadata_xml_path = $file_to_import;
741          $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/;
742          if (&util::file_exists($metadata_xml_path))
743          {
744            &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli);
745          }
746        }
747      }
748
749      if ($self->{'manifest_version'} > 0)
750      {
751        # Process metadata files (?)
752        # Process files
753        foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}})
754        {
755          &plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
756        }
757      }
758    }
759    else {
760    # if incremental, we read through the import folder to see whats changed.
761
762    if ($incremental || $incremental_mode eq "onlyadd") {
763        prime_doc_oid_count($archivedir);
764
765        # Can now work out which files were new, already existed, and have
766        # been deleted
767       
768        new_vs_old_import_diff($archive_info,$block_hash,$importdir,
769                   $archivedir,$verbosity,$incremental_mode);
770       
771        my @new_files = sort keys %{$block_hash->{'new_files'}};
772        if (scalar(@new_files>0)) {
773        print STDERR "New files and modified metadata files since last import:\n  ";
774        print STDERR join("\n  ",@new_files), "\n";
775        }
776
777        if ($incremental) {
778               # only look for deletions if we are truely incremental
779        my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
780        # Filter out any in gsdl/tmp area
781        my @filtered_deleted_files = ();
782        my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
783        my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
784        $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
785        $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
786                 
787        foreach my $df (@deleted_files) {
788            next if ($df =~ m/^$gsdl_tmp_area/);
789            next if ($df =~ m/^$collect_tmp_area/);
790           
791            push(@filtered_deleted_files,$df);
792        }       
793       
794
795        @deleted_files = @filtered_deleted_files;
796       
797        if (scalar(@deleted_files)>0) {
798            print STDERR "Files deleted since last import:\n  ";
799            print STDERR join("\n  ",@deleted_files), "\n";
800       
801       
802            &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
803           
804            mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
805        }
806       
807        my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
808       
809        if (scalar(@reindex_files)>0) {
810            print STDERR "Files to reindex since last import:\n  ";
811            print STDERR join("\n  ",@reindex_files), "\n";
812            &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
813            mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
814        }
815               
816        }       
817    }
818    }
819
820    # Check for existence of the file that's to contain earliestDateStamp in archivesdir
821    # Do nothing if the file already exists (file exists on incremental build).
822    # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
823    # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
824    # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
825    # oailastmodified and oailastmodifieddate
826    my $earliestDatestampFile = &util::filename_cat($archivedir, "earliestDatestamp");
827    if (!-f $earliestDatestampFile && -d $archivedir) {
828    my $current_time_in_seconds = time; # in seconds
829
830    if(open(FOUT, ">$earliestDatestampFile")) {
831        # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
832        print FOUT $current_time_in_seconds;
833        close(FOUT);
834    }
835    else {
836        &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
837    }
838
839    }
840
841    # now, whichever mode we are in, we can process the entire import folder
842    if ((defined $jobs) && ($jobs > 1))
843    {
844    # if jobs are set to >1, run in parallel using MPI helper
845    # [hs, 1 july 2010]
846    &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,
847                          $self->{'collection'}, $self->{'site'});
848    }
849    # only do this if we aren't using the newer paradigm for manifest files
850    elsif ($self->{'manifest_version'} < 1)
851    {
852    &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
853    }
854   
855   
856    if ($saveas eq "FedoraMETS") {
857    # create collection "doc obj" for Fedora that contains
858    # collection-level metadata
859   
860    my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
861    $doc_obj->set_OID("collection");
862   
863    my $col_name = undef;
864    my $col_meta = $collectcfg->{'collectionmeta'};
865   
866    if (defined $col_meta) {       
867        store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
868        store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description     
869    }
870    $processor->process($doc_obj);
871    }
872
873    &plugin::end($pluginfo, $processor);
874
875    &plugin::deinit($pluginfo, $processor);
876
877    # Store the value of OIDCount (used in doc.pm) so it can be
878    # restored correctly to this value on an incremental build
879    # - this OIDcount file should only be generated for numerical oids [jmt12]
880    if ($self->{'OIDtype'} eq 'incremental')
881    {
882      store_doc_oid_count($archivedir);
883    }
884
885    # write out the archive information file
886    $processor->close_file_output() if (defined $groupsize) && ($groupsize > 1);
887    $processor->close_group_output() if $processor->is_group();
888
889    # for backwards compatability with archvies.inf file
890    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
891    $archive_info->save_info($arcinfo_doc_filename);
892    }
893    else {
894    $archive_info->save_revinfo_db($arcinfo_src_filename);
895    }
896
897    return $pluginfo;
898}
899
900
sub generate_statistics
{
    # Print the end-of-run import/export statistics for this collection.
    #
    # Output goes to the file named in $self->{'statsfile'} unless that is
    # the literal string "STDERR" or "STDOUT" (case-insensitive) or the file
    # cannot be opened, in which case it falls back to STDERR.  When running
    # with multiple parallel jobs ($self->{'jobs'} > 1) nothing is printed,
    # so workers don't clobber a shared stats stream.
    #
    # Args (via @_): $pluginfo - plugin list, handed on to plugin::write_stats
    my $self = shift @_;
    my ($pluginfo) = @_;

    my $inexport_mode = $self->{'mode'};

    my $statsfile   = $self->{'statsfile'};
    my $out         = $self->{'out'};
    my $faillogname = $self->{'faillogname'};
    my $gli         = $self->{'gli'};
    my $jobs        = $self->{'jobs'};

    # write out import stats

    if ((!defined $jobs) || ($jobs == 1))
    {
	# only output statistics when we are NOT running parallel jobs
	# [hs, 1 july 2010]

	my $close_stats = 0;
	if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
	    # 3-arg open prevents mode characters in a hostile filename from
	    # being interpreted; STATS must remain a package (bareword)
	    # handle because its fully-qualified name is passed as a string
	    # to plugin::write_stats below
	    if (open (STATS, '>', $statsfile)) {
		$statsfile = 'inexport::STATS';
		$close_stats = 1;
	    } else {
		# couldn't open the requested file - fall back to STDERR
		&gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
		&gsprintf($out, "{import.stats_backup}\n");
		$statsfile = 'STDERR';
	    }
	}

	&gsprintf($out, "\n");
	&gsprintf($out, "*********************************************\n");
	&gsprintf($out, "{$inexport_mode.complete}\n");
	&gsprintf($out, "*********************************************\n");

	&plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
	if ($close_stats) {
	    close STATS;
	}
    }

    close OUT if $self->{'close_out'};
    close FAILLOG;
}
947
948
# Copy one collection-level metadata field (e.g. "collectionname" or
# "collectionextra") out of the collect.cfg structure and attach it to
# $doc_obj as extracted ("ex.") metadata on the document's top section.
sub store_collectionmeta
{
    my ($collectionmeta, $field, $doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $lang_values = $collectionmeta->{$field};

    while (my ($lang_key, $value) = each %$lang_values)
    {
	my $label = "ex.$field";

	# keys of the form "[l=en]" carry a language qualifier, which
	# becomes a "^en" suffix on the metadata label
	if ($lang_key =~ m/^\[l=(.*?)\]$/)
	{
	    $label .= "^$1";
	}

	$doc_obj->add_utf8_metadata($top_section, $label, $value);

	# see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
	# while "collectionname" in GS2 is called "name" in GS3.
	# Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
	if (($label eq "ex.collectionname^en") || ($label eq "ex.collectionname"))
	{
	    $doc_obj->add_utf8_metadata($top_section, "dc.Title", $value);
	}
    }
}
986
987
# Return the full path of the "OIDcount" state file kept inside the
# given archives directory.
sub oid_count_file {
    my ($archivedir) = @_;
    my $oid_count_filename = &util::filename_cat($archivedir, "OIDcount");
    return $oid_count_filename;
}
992
993
sub prime_doc_oid_count
{
    # Restore doc.pm's document OID counter from the "OIDcount" file in the
    # archives directory (written by store_doc_oid_count at the end of the
    # previous run), so an incremental build continues numbering new
    # documents where the last build left off.  Silently does nothing when
    # the file does not exist (i.e. on a full build).
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # NOTE(review): this revision moved file operations to util:: wrappers
    # for HDFS-awareness; -e remains a plain built-in test here -- confirm
    if (-e $oid_count_filename) {
	# open via util::file_openfdcommand (3-arg style, HDFS-aware) for
	# consistency with store_doc_oid_count below
	if (open(OIDIN, &util::file_openfdcommand($oid_count_filename, '<'))) {
	    my $OIDcount = <OIDIN>;
	    chomp $OIDcount;
	    close(OIDIN);

	    # doc.pm reads this package global when allocating new OIDs
	    $doc::OIDcount = $OIDcount;
	}
	else {
	    &gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
	}
    }
}
1013
sub store_doc_oid_count
{
    # Persist doc.pm's current OID counter into the "OIDcount" file in the
    # archives directory, so prime_doc_oid_count() can restore it correctly
    # on the next incremental build.
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # opened via the util wrapper so non-local (e.g. HDFS) paths work
    my $open_spec = &util::file_openfdcommand($oid_count_filename, '>');

    if (open(OIDOUT, $open_spec)) {
	print OIDOUT $doc::OIDcount, "\n";
	close(OIDOUT);
    }
    else {
	&gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
1032
1033
1034
sub new_vs_old_import_diff
{
    # Compare the files found in the current import directory against the
    # file list recorded at the previous import (held in $archive_info),
    # moving every entry of $block_hash->{'all_files'} into one of:
    #   'new_files', 'existing_files', 'reindex_files',
    #   'new_or_modified_metadata_files' or 'deleted_files'.
    # "Modified" is judged by comparing each file's -M age against the age
    # of the archiveinf-doc database (smaller -M value == newer file).
    #
    # $incremental_mode is "all" (true incremental: unchanged files become
    # 'existing_files') or "onlyadd" (previously-seen files are warned about
    # and treated as new).
    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    # -M gives the file age (in days) relative to script start time
    my $archiveinf_timestamp = -M $arcinfo_doc_filename;

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default
    my $prev_all_files = $archive_info->{'prev_import_filelist'};
    my $full_prev_all_files = {};

    foreach my $prev_file (keys %$prev_all_files) {

	if (!&util::filename_is_absolute($prev_file)) {
	    my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
	    $full_prev_all_files->{$full_prev_file} = $prev_file;
	}
	else {
	    $full_prev_all_files->{$prev_file} = $prev_file;
	}
    }

    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

	my $full_curr_file = $curr_file;

	# entry in 'all_files' is moved to either 'existing_files',
	# 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

	if (!&util::filename_is_absolute($curr_file)) {
	    # add in import dir to make absolute
	    $full_curr_file = &util::filename_cat($importdir,$curr_file);
	}

	# figure out if new file or not
	if (defined $full_prev_all_files->{$full_curr_file}) {
	    # delete it so that only files that need deleting are left
	    delete $full_prev_all_files->{$full_curr_file};

	    # had it before. is it a metadata file?
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {

		# is it modified?? (newer than the archive info database)
		if (-M $full_curr_file < $archiveinf_timestamp) {
		    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
		    # its newer than last build
		    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
		}
	    }
	    else {
		if ($incremental_mode eq "all") {

		    # had it before
		    $block_hash->{'existing_files'}->{$full_curr_file} = 1;

		}
		else {
		    # Warning in "onlyadd" mode, but had it before!
		    print STDERR "Warning: File $full_curr_file previously imported.\n";
		    print STDERR "         Treating as new file\n";

		    $block_hash->{'new_files'}->{$full_curr_file} = 1;

		}
	    }
	}
	else {
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# the new file is the special sort of file greenstone uses
		# to attach metadata to src documents
		# i.e metadata.xml
		# (but note, the filename used is not constrained in
		# Greenstone to always be this)

		print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
		$block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
	    }
	    else {
		$block_hash->{'new_files'}->{$full_curr_file} = 1;
	    }
	}

	delete $block_hash->{'all_files'}->{$curr_file};
    }

    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
	my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

	$situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
	$situated_dir =~ s/\\/\\\\/g;  # need to protect windows slash \ in regular expression

	# Go through existing_files, and mark anything that is contained
	# within 'situated_dir' to be reindexed (in case some of the metadata
	# attaches to one of these files)

	my $reindex_files = [];

	foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

	    if ($existing_f =~ m/^$situated_dir/) {

		print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

		push(@$reindex_files,$existing_f);
		$block_hash->{'reindex_files'}->{$existing_f} = 1;
		delete $block_hash->{'existing_files'}->{$existing_f};

	    }
	}

	# metadata file needs to be in new_files list so parsed by MetadataXMLPlug
	# (or equivalent)
	$block_hash->{'new_files'}->{$new_mdf} = 1;

    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
	if (-M $existing_filename < $archiveinf_timestamp) {
	    # file is newer than last build

	    my $existing_file = $existing_filename;
	    #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});

	    #my $collectdir_resafe = &util::filename_to_regex($collectdir);
	    #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;

	    print STDERR "**** Reindexing existing file: $existing_file\n";

	    push(@$reindex_files,$existing_file);
	    $block_hash->{'reindex_files'}->{$existing_filename} = 1;
	}

    }


    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # supress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    # (was a map in void context; a foreach states the side effects directly)
    foreach my $curr_file (values %$full_prev_all_files) {
	my $full_curr_file = $curr_file;

	if (!&util::filename_is_absolute($curr_file)) {
	    # add in import dir to make absolute

	    $full_curr_file = &util::filename_cat($collectdir,$curr_file);
	}

	if (!-e $full_curr_file) {
	    $block_hash->{'deleted_files'}->{$curr_file} = 1;
	}
    }

}
1232
1233
# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
# $mode is 'delete' or 'reindex'
#
# For each file in @$deleted_files this sub:
#  - removes the file's record from the archiveinf-src database;
#  - sets index-status to "D" in the archiveinf-doc database for every OID
#    the file contributed to, so buildcol.pl will delete or rebuild it;
#  - when the file was only an associated/metadata file of a document,
#    queues that document's primary source file in
#    $block_hash->{'reindex_files'} so it gets reimported.
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    # $mode_text only affects the progress message printed below
    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
    $mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
    # use 'archiveinf-src' info database file to look up all the OIDs
    # that this file is used in (note in most cases, it's just one OID)
    
    my $src_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
    my $oids = $src_rec->{'oid'};
    my $file_record_deleted = 0;

    # delete the src record
    # (handle is opened in "append" mode and closed again per file so each
    # deletion is flushed before the doc records are rewritten below)
    my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
    &dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
    &dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);


    foreach my $oid (@$oids) {

        # find the source doc (the primary file that becomes this oid)
        my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
        my $doc_source_file = $doc_rec->{'src-file'}->[0];
        if (!&util::filename_is_absolute($doc_source_file)) {
        $doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
        }

        if ($doc_source_file ne $file) {
        # its an associated or metadata file
        
        # mark source doc for reimport as one of its assoc files has changed or deleted
        $block_hash->{'reindex_files'}->{$doc_source_file} = 1;
        
        }
        # skip OIDs already marked "D" so we don't rewrite their record twice
        my $curr_status = $archive_info->get_status_info($oid);
        if (defined($curr_status) && (($curr_status ne "D"))) {
        if ($verbosity>1) {
            print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
        }
        # mark oid for deletion (it will be deleted or reimported)
        $archive_info->set_status_info($oid,"D");
        # patch the raw db record's <index-status> line, then write the
        # whole record back so the on-disk database matches $archive_info
        my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
        $val =~ s/^<index-status>(.*)$/<index-status>D/m;

        my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
        my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

        &dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
        &dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
        }
    }
    
    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.

    if ($mode eq "delete") {
    foreach my $file (@$deleted_files) {
        if (defined $block_hash->{'reindex_files'}->{$file}) {
        delete $block_hash->{'reindex_files'}->{$file};
        }
    }
    }


}
1318
sub add_dir_contents_to_list {

    # Recursively collect every file found under $dirname into @$list
    # (as full paths), skipping ".", ".." and .svn administrative
    # directories.
    #
    # Returns -1 when the directory cannot be read; otherwise the return
    # value is not meaningful (callers ignore it).
    my ($dirname, $list) = @_;

    # find all the entries in the directory (lexical handle, checked open)
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
	print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    my @dir_entries = readdir ($dir_handle);
    closedir ($dir_handle);

    foreach my $subfile (@dir_entries) {
	next if ($subfile =~ m/^\.\.?$/);  # skip "." and ".."
	next if ($subfile =~ /^\.svn$/);   # skip subversion metadata
	my $full_file = &util::filename_cat($dirname, $subfile);
	if (-d $full_file) {
	    # recurse into subdirectory
	    &add_dir_contents_to_list($full_file, $list);
	} else {
	    push (@$list, $full_file);
	}
    }

}
1347
1348
13491;
Note: See TracBrowser for help on using the browser.