root/gs2-extensions/parallel-building/trunk/src/perllib/inexport.pm @ 30354

Revision 30354, 48.7 KB (checked in by jmt12, 5 years ago)

Extending manifest v2 support to allow directories to be listed in the manifest. Matched with changes in DirectoryPlugin to allow paths into systems such as HDFS to be listed in the manifest.

  • Property svn:executable set to *
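
For illustration, a hedged sketch of what this revision boils down to (paraphrasing the version checks added in process_files() below; the variable names here are illustrative):

    # v1 manifests: expand a listed directory locally;
    # v2 manifests: pass the path (e.g. an HDFS location) through verbatim
    # so that DirectoryPlugin can handle it.
    if (-d $full_path && $self->{'manifest_version'} != 2) {
        &add_dir_contents_to_list($full_path, \@expanded);
    } else {
        push(@expanded, $full_path);
    }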
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use plugin;
38use plugout;
39use manifest;
40use inexport;
41use util;
42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
47
48use File::Basename;
49use Scalar::Util 'blessed';
50
51my $oidtype_list =
52    [ { 'name' => "hash",
53        'desc' => "{import.OIDtype.hash}" },
54      { 'name' => "hash_on_full_filename",
55        'desc' => "{import.OIDtype.hash_on_full_filename}" },
56      { 'name' => "assigned",
57        'desc' => "{import.OIDtype.assigned}" },
58      { 'name' => "incremental",
59        'desc' => "{import.OIDtype.incremental}" },
60      { 'name' => "filename",
61        'desc' => "{import.OIDtype.filename}" },
62      { 'name' => "dirname",
63        'desc' => "{import.OIDtype.dirname}" },
64      { 'name' => "full_filename",
65        'desc' => "{import.OIDtype.full_filename}" } ];
66
67$inexport::directory_arguments =
68[
69      { 'name' => "importdir",
70    'desc' => "{import.importdir}",
71    'type' => "string",
72    'reqd' => "no",
73    'deft' => "import",
74        'hiddengli' => "yes" },
75      { 'name' => "collectdir",
76    'desc' => "{import.collectdir}",
77    'type' => "string",
78    # parsearg left "" as default
79    #'deft' => &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect"),
80    'deft' => "",
81    'reqd' => "no",
82        'hiddengli' => "yes" },
83 
84];
85$inexport::arguments =
86[
87      # don't set the default to hash here - we want to allow the value to come
88      # from the entry in collect.cfg, while still being able to override it here
89      { 'name' => "OIDtype",
90    'desc' => "{import.OIDtype}",
91    'type' => "enum",
92    'list' => $oidtype_list,
93    'deft' => "hash_on_full_filename",
94    'reqd' => "no",
95    'modegli' => "2" },
96      { 'name' => "OIDmetadata",
97    'desc' => "{import.OIDmetadata}",
98    'type' => "string",
99    'deft' => "dc.Identifier",
100    'reqd' => "no",
101    'modegli' => "2" },
102      { 'name' => "site",
103    'desc' => "{import.site}",
104    'type' => "string",
105    'deft' => "",
106    'reqd' => "no",
107        'hiddengli' => "yes" },
108      { 'name' => "manifest",
109    'desc' => "{import.manifest}",
110    'type' => "string",
111    'deft' => "",
112    'reqd' => "no",
113        'hiddengli' => "yes" } ,
114     { 'name' => "incremental",
115    'desc' => "{import.incremental}",
116    'type' => "flag",
117    'hiddengli' => "yes" },
118      { 'name' => "keepold",
119    'desc' => "{import.keepold}",
120    'type' => "flag",
121    'reqd' => "no",
122    'hiddengli' => "yes" },
123      { 'name' => "removeold",
124    'desc' => "{import.removeold}",
125    'type' => "flag",
126    'reqd' => "no",
127    'hiddengli' => "yes" },
128      { 'name' => "language",
129    'desc' => "{scripts.language}",
130    'type' => "string",
131    'reqd' => "no",
132    'hiddengli' => "yes" },
133      { 'name' => "maxdocs",
134    'desc' => "{import.maxdocs}",
135    'type' => "int",
136    'reqd' => "no",
137    'deft' => "-1",
138    'range' => "-1,",
139    'modegli' => "1" },
140       { 'name' => "debug",
141    'desc' => "{import.debug}",
142    'type' => "flag",
143    'reqd' => "no",
144        'hiddengli' => "yes" },
145      { 'name' => "faillog",
146    'desc' => "{import.faillog}",
147    'type' => "string",
148    # parsearg left "" as default
149    #'deft' => &FileUtils::filenameConcatenate("<collectdir>", "colname", "etc", "fail.log"),
150    'deft' => "",
151    'reqd' => "no",
152        'modegli' => "3" },
153       { 'name' => "out",
154    'desc' => "{import.out}",
155    'type' => "string",
156    'deft' => "STDERR",
157    'reqd' => "no",
158        'hiddengli' => "yes" },
159      { 'name' => "statsfile",
160    'desc' => "{import.statsfile}",
161    'type' => "string",
162    'deft' => "STDERR",
163    'reqd' => "no",
164        'hiddengli' => "yes" },
165      { 'name' => "verbosity",
166    'desc' => "{import.verbosity}",
167    'type' => "int",
168    'range' => "0,",
169    'deft' => "2",
170    'reqd' => "no",
171    'modegli' => "3" },
172      { 'name' => "gli",
173    'desc' => "{scripts.gli}",
174    'type' => "flag",
175    'reqd' => "no",
176    'hiddengli' => "yes" },
177      { 'name' => "xml",
178    'desc' => "{scripts.xml}",
179    'type' => "flag",
180    'reqd' => "no",
181    'hiddengli' => "yes" },
182
183];
184
185sub new
186{
187    my $class = shift (@_);
188    my ($mode,$argv,$options,$opt_listall_options) = @_;
189
190    my $self = { 'xml' => 0, 'mode' => $mode };
191
192    # general options available to all plugins
193    my $arguments = $options->{'args'};
194    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
195    # Parse returns -1 if something has gone wrong
196    if ($intArgLeftinAfterParsing == -1)
197    {
198    &PrintUsage::print_txt_usage($options, "{import.params}",1);
199    print STDERR "Something went wrong during parsing the arguments. Scroll up for details.\n";
200    die "\n";
201    }
202
203    if ($self->{'verbosity'} > 2) {
204    print "[INFO] This inexport.pm supports version 2 manifest files\n";
205    }
206    if ($self->{'verbosity'} > 3) {
207    print '[DEBUG] Perl @INC: ' . join(", ", @INC) . "\n";
208    }
209
210    my $language = $self->{'language'};
211    # If $language has been specified, load the appropriate resource bundle
212    # (Otherwise, the default resource bundle will be loaded automatically)
213    if ($language && $language =~ /\S/) {
214    &gsprintf::load_language_specific_resource_bundle($language);
215    }
216
217    if ($self->{'listall'}) {
218    if ($self->{'xml'}) {
219        &PrintUsage::print_xml_usage($opt_listall_options);
220    }
221    else
222    {
223        &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
224    }
225    die "\n";
226    }
227
228    if ($self->{'xml'}) {
229        &PrintUsage::print_xml_usage($options);
230    print "\n";
231    return bless $self, $class;
232    }
233
234    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
235    &gsprintf::output_strings_in_UTF8;
236    }
237
238    # If the user specified -h, then we output the usage
239    if (@$argv && $argv->[0] =~ /^\-+h/) {
240    &PrintUsage::print_txt_usage($options, "{import.params}");
241    die "\n";
242    }
243    # now check that we had exactly one leftover arg, which should be
244    # the collection name. We don't want to do this earlier, because the
245    # -xml arg doesn't need a collection name
246
247    if ($intArgLeftinAfterParsing != 1 )
248    {
249    &PrintUsage::print_txt_usage($options, "{import.params}", 1);
250    print STDERR "There should be one argument left after parsing the script args: the collection name.\n";
251    die "\n";
252    }
253
254    $self->{'close_out'} = 0;
255    my $out = $self->{'out'};
256    if ($out !~ /^(STDERR|STDOUT)$/i) {
257    open (OUT, ">$out") ||
258        (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
259    $out = 'inexport::OUT';
260    $self->{'close_out'} = 1;
261    }
262    $out->autoflush(1);
263    $self->{'out'} = $out;
264
265    my $statsfile = $self->{'statsfile'};
266    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
267    open (STATSFILE, ">$statsfile") ||
268        (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $statsfile) && die);
269    $statsfile = 'inexport::STATSFILE';
270    $self->{'close_stats'} = 1;
271    }
272    $statsfile->autoflush(1);
273    $self->{'statsfile'} = $statsfile;
274
275    # @ARGV should be only one item, the name of the collection
276    $self->{'collection'} = shift @$argv;
277
278    # Unless otherwise stated all manifests are considered version 1---where
279    # they act more like an advanced process expression---as compared to newer
280    # manifest files that act as an explicit (and exhaustive) list of files to
281    # process [jmt12]
282    $self->{'manifest_version'} = 1;
283
284    return bless $self, $class;
285}
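# A hedged usage sketch of the typical driver flow (caller and variable
# names are illustrative, loosely mirroring import.pl, and are not taken
# verbatim from any script):
#
#   my $options  = { 'args' => $inexport::arguments };   # plus any script-specific args
#   my $inexport = new inexport("import", \@ARGV, $options);
#   my $collection = $inexport->get_collection();
#   my ($config_filename, $collectcfg) = $inexport->read_collection_cfg($collection, $options);
#   $inexport->set_collection_options($collectcfg);
#   my $pluginfo = $inexport->process_files($config_filename, $collectcfg);
#   $inexport->generate_statistics($pluginfo);
#   $inexport->deinit();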
286
287# Simplified version of the constructor for use with CGI scripts
288sub newCGI
289{
290    my $class = shift (@_);
291    my ($mode,$collect,$gsdl_cgi,$opt_site) = @_;
292
293    my $self = { 'xml' => 0, 'mode' => $mode };
294
295    $self->{'out'} = STDERR;
296   
297    if (defined $gsdl_cgi) {
298        $self->{'site'} = $opt_site;
299        my $collect_dir = $gsdl_cgi->get_collection_dir($opt_site);
300        $self->{'collectdir'} = $collect_dir;
301    }
302    else { 
303        $self->{'site'} = "";
304        $self->{'collectdir'} = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
305    }
306    $self->{'faillog'} = "";
307   
308    $self->{'collection'} = $collect;
309
310    return bless $self, $class;
311}
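# A hedged usage sketch for the CGI constructor above (names are
# illustrative only):
#
#   my $inexport = inexport->newCGI("import", $collect, $gsdl_cgi, $opt_site);
#
# No argument parsing is done here; only the mode, site, collectdir,
# faillog and collection are recorded before the object is returned.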
312sub get_collection
313{
314    my $self = shift @_;
315   
316    return $self->{'collection'};
317}
318
319
320sub read_collection_cfg
321{
322    my $self = shift @_;
323    my ($collection,$options) = @_;
324
325    my $collectdir = $self->{'collectdir'};
326    my $site       = $self->{'site'};
327    my $out        = $self->{'out'};
328     
329    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
330    #&PrintUsage::print_txt_usage($options, "{import.params}", 1);
331    die "\n";
332    }
333
334    # set gs_version 2/3
335    $self->{'gs_version'} = "2";
336    if ((defined $site) && ($site ne "")) {
337    # gs3
338    $self->{'gs_version'} = "3";
339    }
340
341    # add collection's perllib dir into include path in
342    # case we have collection specific modules
343    &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib'));
344
345    # check that we can open the faillog
346    my $faillog = $self->{'faillog'};
347    if ($faillog eq "") {
348    $faillog = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
349    }
350    open (FAILLOG, ">$faillog") ||
351    (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);
352
353   
354    my $faillogname = $faillog;
355    $faillog = 'inexport::FAILLOG';
356    $faillog->autoflush(1);
357    $self->{'faillog'} = $faillog;
358    $self->{'faillogname'} = $faillogname;
359    $self->{'close_faillog'} = 1;
360
361    # Read in the collection configuration file.
362    my $gs_mode = "gs".$self->{'gs_version'}; #gs2 or gs3
363    my $config_filename = &colcfg::get_collect_cfg_name($out, $gs_mode);
364    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);
365
366    return ($config_filename,$collectcfg);
367}
368
369sub set_collection_options
370{
371    my $self = shift @_;
372    my ($collectcfg) = @_;
373
374    my $inexport_mode = $self->{'mode'};
375
376    my $importdir  = $self->{'importdir'};
377    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
378    my $out        = $self->{'out'};
379
380    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
381    if (!defined($collectcfg->{'infodbtype'}))
382    {
383      $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
384    }
385    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
386    # we can't use the text version for archives dbs.
387    $collectcfg->{'infodbtype'} = "gdbm";
388    }
389
390    if (defined $self->{'default_importdir'} && defined $collectcfg->{'importdir'}) {
391    $importdir = $collectcfg->{'importdir'};
392    }
393
394    if ($inexport_mode eq "import") {
395    if ( defined $self->{'default_archivedir'} && defined $collectcfg->{'archivedir'}) {
396        $archivedir = $collectcfg->{'archivedir'};
397    }
398    }
399    elsif ($inexport_mode eq "export") {
400    if (defined $self->{'default_exportdir'} && defined $collectcfg->{'exportdir'}) {
401        $archivedir = $collectcfg->{'exportdir'};
402    }
403    }
404    # fill in the default import and archives directories if none
405    # were supplied, and sanitize the supplied paths
406    if (!&FileUtils::isFilenameAbsolute($importdir))
407    {
408      $importdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $importdir);
409    }
410    else
411    {
412      # Don't do this - it kills protocol prefixes
413      #$importdir =~ s/[\\\/]+/\//g;
414      #$importdir =~ s/\/$//;
415      # Do this instead
416      $importdir = &FileUtils::sanitizePath($importdir);
417    }
418    if (!&FileUtils::directoryExists($importdir))
419    {
420      &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
421      die "\n";
422    }
423    $self->{'importdir'} = $importdir;
424
425    if (!&FileUtils::isFilenameAbsolute($archivedir)) {
426    $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $archivedir);
427    }
428    else {
429   
430    $archivedir = &FileUtils::sanitizePath($archivedir);
431    }
432    $self->{'archivedir'} = $archivedir;
433
434    if (defined $self->{'default_verbosity'}) {
435    if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
436        $self->{'verbosity'} = $collectcfg->{'verbosity'};
437    }
438    }
439 
440    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
441    $self->{'manifest'} = $collectcfg->{'manifest'};
442    }
443
444    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
445    if ($collectcfg->{'gzip'} =~ /^true$/i) {
446        $self->{'gzip'} = 1;
447    }
448    }
449
450    if (defined $self->{'default_maxdocs'}) {
451    if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
452        $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
453    }
454    }
455
456   
457
458    if (defined $self->{'default_OIDtype'} ) {
459    if (defined $collectcfg->{'OIDtype'}
460        && $collectcfg->{'OIDtype'} =~ /^(hash|hash_on_full_filename|incremental|assigned|filename|dirname|full_filename)$/) {
461        $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
462    }
463    }
464
465    if (defined $self->{'default_OIDmetadata'}) {
466    if (defined $collectcfg->{'OIDmetadata'}) {
467        $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
468    }
469    }
470
471    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
472    $self->{'debug'} = 1;
473    }
474    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
475    $self->{'gli'} = 1;
476    }
477    $self->{'gli'} = 0 unless defined $self->{'gli'};
478       
479    # check keepold and removeold
480    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";
481
482    my ($removeold, $keepold, $incremental, $incremental_mode)
483    = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
484                           $self->{'incremental'}, $checkdir,
485                           $collectcfg);
486
487    $self->{'removeold'}        = $removeold;
488    $self->{'keepold'}          = $keepold;
489    $self->{'incremental'}      = $incremental;
490    $self->{'incremental_mode'} = $incremental_mode;
491
492    # We'll need direct access to this plugin to support v2 manifests
493    $self->{'directoryplugin'} = 0;
494}
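# For reference, the collect.cfg values consulted above are plain
# "keyword value" lines; a hedged example (the values shown are
# illustrative, not recommended defaults):
#
#   importdir   import
#   verbosity   3
#   maxdocs     -1
#   OIDtype     hash_on_full_filename
#   debug       false
#
# Only keys actually present in collect.cfg override the script
# arguments/defaults, as the guarded checks above show.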
495
496sub process_files
497{
498    my $self = shift @_;
499    my ($config_filename,$collectcfg) = @_;
500
501    my $inexport_mode = $self->{'mode'};
502
503    my $verbosity   = $self->{'verbosity'};
504    my $debug       = $self->{'debug'};
505
506    my $importdir   = $self->{'importdir'};
507    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
508
509    my $incremental = $self->{'incremental'};
510    my $incremental_mode = $self->{'incremental_mode'};
511
512    my $gs_version = $self->{'gs_version'};
513
514    my $removeold   = $self->{'removeold'};
515    my $keepold     = $self->{'keepold'};
516
517    my $saveas      = $self->{'saveas'};
518    my $saveas_options = $self->{'saveas_options'};
519    my $OIDtype     = $self->{'OIDtype'};
520    my $OIDmetadata = $self->{'OIDmetadata'};
521
522    my $out         = $self->{'out'};
523    my $faillog     = $self->{'faillog'};
524
525    my $maxdocs     = $self->{'maxdocs'};
526    my $gzip        = $self->{'gzip'};
527    my $groupsize   = $self->{'groupsize'};
528    my $sortmeta    = $self->{'sortmeta'};
529
530    my $removeprefix = $self->{'removeprefix'};
531    my $removesuffix = $self->{'removesuffix'};
532
533    my $gli          = $self->{'gli'};
534
535    # related to export
536    my $xsltfile         = $self->{'xsltfile'};
537    my $group_marc       = $self->{'group_marc'};
538    my $mapping_file     = $self->{'mapping_file'};
539    my $xslt_mets        = $self->{'xslt_mets'};
540    my $xslt_txt         = $self->{'xslt_txt'};
541    my $fedora_namespace = $self->{'fedora_namespace'};
542    my $metadata_prefix  = $self->{'metadata_prefix'};
543
544    if ($inexport_mode eq "import") {
545    print STDERR "<Import>\n" if $gli;
546    }
547    else {
548    print STDERR "<export>\n" if $gli;
549    }
550
551    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
552    if ($self->{'manifest'} ne "") {
553    my $manifest_filename = $self->{'manifest'};
554
555    if (!&FileUtils::isFilenameAbsolute($manifest_filename)) {
556        $manifest_filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
557    }
558        $self->{'manifest'} = &FileUtils::sanitizePath($self->{'manifest'});
559    #$self->{'manifest'} =~ s/[\\\/]+/\//g;
560    #$self->{'manifest'} =~ s/\/$//;
561
562    $manifest_lookup->parse($manifest_filename);
563
564        # manifests may now include a version number [jmt12]
565        $self->{'manifest_version'} = $manifest_lookup->get_version();
566    }
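    # A hedged sketch of the structure this file relies on from manifest.pm
    # (only the pieces used below are listed; manifest.pm's internals may differ):
    #
    #   $manifest_lookup->{'index'}      # files/dirs to import   (hash keys)
    #   $manifest_lookup->{'reindex'}    # files/dirs to reindex  (hash keys)
    #   $manifest_lookup->{'delete'}     # files/dirs to delete   (hash keys)
    #   $manifest_lookup->get_version()  # 1 (classic) or 2 (verbatim file list)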
567
568    my $manifest = $self->{'manifest'};
569
570    # load all the plugins
571    my $plugins = [];
572    if (defined $collectcfg->{'plugin'}) {
573    $plugins = $collectcfg->{'plugin'};
574    }
575
576    my $plugin_incr_mode = $incremental_mode;
577    if ($manifest ne "") {
578    # if we have a manifest file, then we pretend we are fully incremental for plugins
579    $plugin_incr_mode = "all";
580    }
581    # some global options for the plugins
582    my @global_opts = ();
583
584    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode, $gs_version);
585    if (scalar(@$pluginfo) == 0) {
586    &gsprintf($out, "{import.no_plugins_loaded}\n");
587    die "\n";
588    }
589    # Store a reference to the DirectoryPlugin
590    foreach my $a_plugin (@{$pluginfo})
591    {
592    if (blessed ($a_plugin) eq 'DirectoryPlugin')
593    {
594        $self->{'directoryplugin'} = $a_plugin;
595    }
596    }
597    # No directory plugin - no v2 manifest support
598    if ($self->{'directoryplugin'} == 0)
599    {
600    print STDERR "WARNING: DirectoryPlugin not loaded: metadata.xml files not supported.\n";
601    }
602
603    # remove the old contents of the archives directory (and tmp
604    # directory) if needed
605
606    if ($removeold) {
607    if (&FileUtils::directoryExists($archivedir)) {
608        &gsprintf($out, "{import.removing_archives}\n");
609        &FileUtils::removeFilesRecursive($archivedir);
610    }
611    my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
612    $tmpdir =~ s/[\\\/]+/\//g;
613    $tmpdir =~ s/\/$//;
614    if (&FileUtils::directoryExists($tmpdir)) {
615        &gsprintf($out, "{import.removing_tmpdir}\n");
616        &FileUtils::removeFilesRecursive($tmpdir);
617    }
618    }
619
620    # create the archives dir if needed
621    &FileUtils::makeAllDirectories($archivedir);
622
623    # read the archive information file
624
625    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
626    &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-doc"));
627    &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-src"));
628
629    # When we make these initial calls to determine the archive information doc
630    # and src databases we pass through a '1' to indicate this is the first
631    # time we are referring to these databases. When using dynamic dbutils
632    # (available in extensions) this indicates to some database types (for
633    # example, persistent servers) that this is a good time to perform any
634    # one time initialization. The argument has no effect on vanilla dbutils
635    # [jmt12]
636    my $perform_firsttime_init = 1;
637    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, $perform_firsttime_init);
638    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, $perform_firsttime_init);
639
640    my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
641    $archive_info->load_info ($arcinfo_doc_filename);
642
643    if ($manifest eq "") {
644    # Load in list of files in import folder from last import (if present)
645    $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
646    }
647
648    #### Use Plugout ####
649    my $plugout;
650
651    my $generate_auxiliary_files = 0;
652    if ($inexport_mode eq "import") {
653    $generate_auxiliary_files = 1;
654    }
655    elsif ($self->{'include_auxiliary_database_files'}) {
656    $generate_auxiliary_files = 1;
657    }
658    $self->{'generate_auxiliary_files'} = $generate_auxiliary_files;
659
660    # Option to use user defined plugout
661    if ($inexport_mode eq "import") {
662    if (defined $collectcfg->{'plugout'}) {
663        # If a plugout was specified in the collect.cfg file, assume it is sensible
664        # We can't check the name because it could be anything, if it is a custom plugout
665        print STDERR "Using plugout specified in collect.cfg: ".join(' ', @{$collectcfg->{'plugout'}})."\n";
666        $plugout = $collectcfg->{'plugout'};
667    }
668    else {
669        push @$plugout,$saveas."Plugout";
670    }
671
672    }
673    else {
674    if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(GreenstoneXML|.*METS|DSpace|MARCXML)Plugout/) {
675        $plugout = $collectcfg->{'plugout'};
676        print STDERR "Using plugout specified in collect.cfg: $collectcfg->{'plugout'}\n";
677    }
678    else {
679        push @$plugout,$saveas."Plugout";
680    }
681    }
682
683    my $plugout_name = $plugout->[0];
684
685    if ($inexport_mode eq "export" && defined $saveas_options) {
686    my @user_plugout_options = split(" ", $saveas_options);
687    push @$plugout, @user_plugout_options;
688    }
689    push @$plugout,("-output_info",$archive_info)  if (defined $archive_info);
690    push @$plugout,("-verbosity",$verbosity)       if (defined $verbosity);
691    push @$plugout,("-debug")                      if ($debug);
692    push @$plugout,("-gzip_output")                if ($gzip);
693    push @$plugout,("-output_handle",$out)         if (defined $out);
694
695    push @$plugout,("-xslt_file",$xsltfile)        if (defined $xsltfile && $xsltfile ne "");
696    push @$plugout, ("-no_auxiliary_databases") if ($generate_auxiliary_files == 0);
697    if ($inexport_mode eq "import") {
698    if ($plugout_name =~ m/^GreenstoneXMLPlugout$/) {
699        push @$plugout,("-group_size",$groupsize)      if (defined $groupsize);
700    }
701    }
702    my $processor = &plugout::load_plugout($plugout);
703    $processor->setoutputdir ($archivedir);
704    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
705    $processor->set_OIDtype ($OIDtype, $OIDmetadata);
706    $processor->begin();
707    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
708   
709    if ($removeold) {
710        # occasionally, plugins may want to do something on removeold,
711        # e.g. Pharos image indexing
712    &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
713    }
714
715    # process the import directory
716    my $block_hash = {};
717    $block_hash->{'new_files'} = {};
718    $block_hash->{'reindex_files'} = {};
719    # all of these are set somewhere else, so it's more readable to define them
720    # here [jmt12]
721    $block_hash->{'all_files'} = {};
722    $block_hash->{'deleted_files'} = {};
723    $block_hash->{'file_blocks'} = {};
724    $block_hash->{'metadata_files'} = {};
725    $block_hash->{'shared_fileroot'} = '';
726    # a new flag so we can tell we had a manifest way down in the plugins
727    # [jmt12]
728    $block_hash->{'manifest'} = 'false';
729    my $metadata = {};
730   
731    # the global blocking pass may set up some metadata
732    # (it is not clear whether it actually does)
733    # - when we have a newer manifest file we don't do this -unless- the
734    #   collection configuration indicates this collection contains complex
735    #   (inherited) metadata [jmt12]
736    if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true'))
737    {
738      &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
739    }
740    else
741    {
742      print STDERR "Skipping import directory-level global file scan due to manifest and complexmeta configuration\n";
743    }
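      # A hedged summary of the branch just taken:
      #
      #   no manifest                       -> global &plugin::file_block_read() scan
      #   manifest + complexmeta 'true'     -> the global scan still runs
      #   manifest, complexmeta not 'true'  -> scan skipped; per-file metadata.xml
      #                                        handling happens further below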
744
745    if ($manifest ne "") {
746
747      # mark that we are using a manifest - information that might be needed
748      # down in plugins (for instance DirectoryPlugin)
749      $block_hash->{'manifest'} = $self->{'manifest_version'};
750
751    #
752    # 1. Process delete files first
753    #
754    my @deleted_files = keys %{$manifest_lookup->{'delete'}};
755    my @full_deleted_files = ();
756
757    # ensure all filenames are absolute
758    foreach my $df (@deleted_files) {
759        my $full_df =
760        (&FileUtils::isFilenameAbsolute($df))
761        ? $df
762        : &FileUtils::filenameConcatenate($importdir,$df);
763
764        if (-d $full_df && $self->{'manifest_version'} != 2) {
765        &add_dir_contents_to_list($full_df, \@full_deleted_files);
766        } else {
767        push(@full_deleted_files,$full_df);
768        }
769    }
770   
771    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
772    mark_docs_for_deletion($archive_info,{},
773                   \@full_deleted_files,
774                   $archivedir, $verbosity, "delete");
775
776
777    #
778    # 2. Now files for reindexing
779    #
780
781    my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
782    my @full_reindex_files = ();
783    # ensure all filenames are absolute
784    foreach my $rf (@reindex_files) {       
785        my $full_rf =
786        (&FileUtils::isFilenameAbsolute($rf))
787        ? $rf
788        : &FileUtils::filenameConcatenate($importdir,$rf);
789
790        if (-d $full_rf && $self->{'manifest_version'} != 2) {
791        &add_dir_contents_to_list($full_rf, \@full_reindex_files);
792        } else {
793        push(@full_reindex_files,$full_rf);
794        }
795    }
796   
797    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
798    mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
799
800    # And now, to ensure the new version of the file is processed by the
801    # appropriate plugin, we need to add it to the block_hash reindex list
802    foreach my $full_rf (@full_reindex_files) {
803        $block_hash->{'reindex_files'}->{$full_rf} = 1;
804    }
805
806
807    #
808    # 3. Now finally any new files - add to block_hash new_files list
809    #
810
811    my @new_files = keys %{$manifest_lookup->{'index'}};
812    my @full_new_files = ();
813
814    foreach my $nf (@new_files) {
815        # ensure filename is absolute
816        my $full_nf =
817        (&FileUtils::isFilenameAbsolute($nf))
818        ? $nf
819        : &FileUtils::filenameConcatenate($importdir,$nf);
820
821        if (-d $full_nf && $self->{'manifest_version'} != 2) {
822        &add_dir_contents_to_list($full_nf, \@full_new_files);
823        } else {
824        push(@full_new_files,$full_nf);
825        }
826    }
827
828    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
829      # need to check this file exists before trying to read it - in the past
830      # it wasn't possible to have a manifest unless keepold was also set so
831      # you were pretty much guaranteed arcinfo existed
832      # [jmt12]
833      # @todo &FileUtils::fileExists($arcinfo_src_filename) [jmt12]
834      if (-e $arcinfo_src_filename)
835      {
836    my $arcinfodb_map = {};
837    &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
838    foreach my $f (@full_new_files) {
839        my $rel_f = &util::abspath_to_placeholders($f);
840
841        # check that we haven't seen it already
842        if (defined $arcinfodb_map->{$rel_f}) {
843        # TODO make better warning
844        print STDERR "Warning: $f ($rel_f) already in src archive, \n";
845        } else {
846        $block_hash->{'new_files'}->{$f} = 1;
847        }
848    }
849
850    undef $arcinfodb_map;
851      }
852      # no existing files - so we can just add all the files [jmt12]
853      else
854      {
855        foreach my $f (@full_new_files)
856        {
857          $block_hash->{'new_files'}->{$f} = 1;
858        }
859      }
860
861      # If we are not using complex inherited metadata (and thus have skipped
862      # the global file scan) we need to at least check for a matching
863      # metadata.xml for the files being indexed/reindexed
864      # - unless we are using the newer version of Manifests, which are treated
865      #   verbatim, and should have a metadata element for metadata files (so
866      #   we can explicitly process metadata files other than metadata.xml)
867      # [jmt12]
868      if ($self->{'manifest_version'} == 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true'))
869      {
870        my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
871        foreach my $file_to_import (@all_files_to_import)
872        {
873          my $metadata_xml_path = $file_to_import;
874          $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/;
875          if (&FileUtils::fileExists($metadata_xml_path))
876          {
877            &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli);
878          }
879        }
880      }
881
882      # new version manifest files explicitly list the files to be processed and
883      # only support the 'simplemeta' format (ignoring complexmeta if set), in which
884      # each document can be accompanied by a metadata.xml file in the same
885      # directory. The metadata.xml can only apply to the fileset ".*".
886      # [jmt12]
887      if ($self->{'manifest_version'} > 1)
888      {
889        # Process metadata files
890        foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}})
891        {
892        if (&FileUtils::directoryExists($file_to_import)) {
893#       print "DEBUG: Directory to import: \"" . $file_to_import . "\"\n";
894        &plugin::file_block_read($pluginfo, '', $file_to_import, $block_hash, $metadata, $gli);
895#       print "\n===== BLOCK HASH =====\n";
896#       Dump($block_hash);
897#       print "\n=====            =====\n\n";
898        $self->perform_process_files($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs);
899        }
900        else
901        {
902#       print "DEBUG: File to import: \"" . $file_to_import . "\"\n";
903        $self->{'directoryplugin'}->read_for_manifest_v2($pluginfo, $file_to_import, $block_hash, $processor, $gli);
904        }
905        }
906      }
907    }
908    else {
909    # if incremental, we read through the import folder to see what's changed.
910
911    if ($incremental || $incremental_mode eq "onlyadd") {
912        prime_doc_oid_count($archivedir);
913
914        # Can now work out which files were new, already existed, and have
915        # been deleted
916       
917        new_vs_old_import_diff($archive_info,$block_hash,$importdir,
918                   $archivedir,$verbosity,$incremental_mode);
919       
920        my @new_files = sort keys %{$block_hash->{'new_files'}};
921        if (scalar(@new_files) > 0) {
922        print STDERR "New files and modified metadata files since last import:\n  ";
923        print STDERR join("\n  ",@new_files), "\n";
924        }
925
926        if ($incremental) {
927               # only look for deletions if we are truly incremental
928        my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
929        # Filter out any in gsdl/tmp area
930        my @filtered_deleted_files = ();
931        my $gsdl_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
932        my $collect_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
933        $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
934        $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
935                 
936        foreach my $df (@deleted_files) {
937            next if ($df =~ m/^$gsdl_tmp_area/);
938            next if ($df =~ m/^$collect_tmp_area/);
939           
940            push(@filtered_deleted_files,$df);
941        }       
942       
943
944        @deleted_files = @filtered_deleted_files;
945       
946        if (scalar(@deleted_files)>0) {
947            print STDERR "Files deleted since last import:\n  ";
948            print STDERR join("\n  ",@deleted_files), "\n";
949       
950       
951            &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
952           
953            mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
954        }
955       
956        my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
957       
958        if (scalar(@reindex_files)>0) {
959            print STDERR "Files to reindex since last import:\n  ";
960            print STDERR join("\n  ",@reindex_files), "\n";
961            &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
962            mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
963        }
964               
965        }       
966    }
967    }
968
969    # Check for the existence of the file that is to contain earliestDatestamp in the archives directory.
970    # Do nothing if the file already exists (the file exists on an incremental build).
971    # If the file doesn't exist, as happens on a full build, create it and write the current datestamp into it.
972    # In buildcol, read the file's contents and set the earliestDatestamp in GS2's build.cfg / GS3's buildconfig.xml.
973    # In doc.pm, set_oaiLastModified (similar to set_lastmodified) creates the doc fields
974    # oailastmodified and oailastmodifieddate.
975    my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp");
976    if ($self->{'generate_auxiliary_files'}) {
977    if (!-f $earliestDatestampFile && -d $archivedir) {
978    my $current_time_in_seconds = time; # in seconds
979
980    if(open(FOUT, ">$earliestDatestampFile")) {
981        # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
982        print FOUT $current_time_in_seconds;
983        close(FOUT);
984    }
985    else {
986        &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
987    }
988
989    }
990    }
991
992    if ($self->{'manifest_version'} != 2)
993    {
994    $self->perform_process_files($manifest, $pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs);
995    }
996   
997    if ($saveas eq "FedoraMETS") {
998    # create collection "doc obj" for Fedora that contains
999    # collection-level metadata
1000   
1001    my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
1002    $doc_obj->set_OID("collection");
1003   
1004    my $col_name = undef;
1005    my $col_meta = $collectcfg->{'collectionmeta'};
1006   
1007    if (defined $col_meta) {       
1008        store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
1009        store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description     
1010    }
1011    $processor->process($doc_obj);
1012    }
1013
1014    &plugin::end($pluginfo, $processor);
1015
1016    &plugin::deinit($pluginfo, $processor);
1017
1018    # Store the value of OIDCount (used in doc.pm) so it can be
1019    # restored correctly to this value on an incremental build
1020    # - this OIDcount file should only be generated for numerical oids [jmt12]
1021    if ($self->{'OIDtype'} eq 'incremental')
1022    {
1023      store_doc_oid_count($archivedir);
1024    }
1025
1026    # signal to the processor (plugout) that we have finished processing - if we are group processing, then the final output file needs closing.
1027    $processor->close_group_output() if $processor->is_group();
1028
1029#    if ($inexport_mode eq "import") {
1030    if ($self->{'generate_auxiliary_files'}) {
1031    # write out the archive information file
1032    # for backwards compatibility with the archives.inf file
1033    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
1034        $archive_info->save_info($arcinfo_doc_filename);
1035    }
1036    else {
1037        $archive_info->save_revinfo_db($arcinfo_src_filename);
1038    }
1039    }
1040    return $pluginfo;
1041}
1042
1043# @function perform_process_files()
1044# While process_files() above prepares the system to import files, this is the
1045# function that actually initiates the plugin pipeline to process the files.
1046# This function can therefore be overridden in subclasses of inexport.pm should
1047# they wish to do different or further processing.
1048# @author jmt12
1049sub perform_process_files
1050{
1051  my $self = shift(@_);
1052  my ($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs) = @_;
1053  my $gli = $self->{'gli'};
1054  # specific file to process - via manifest version 2+
1055  if ($file_to_import ne '')
1056  {
1057    &plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
1058  }
1059  # global file scan - if we are using a new version manifest, files would have
1060  # been read above. Older manifests use extra settings in the $block_hash to
1061  # control what is imported, while non-manifest imports use a regular
1062  # $block_hash (so obeying process_exp and block_exp) [jmt12]
1063  elsif ($manifest eq '' || $self->{'manifest_version'} == 1)
1064  {
1065    &plugin::read ($pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
1066  }
1067  else
1068  {
1069    print STDERR "Skipping perform_process_files() due to manifest presence and version\n";
1070  }
1071}
1072# perform_process_files()
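# A hedged sketch of the kind of subclass override the comment above
# invites (the package name and the extra step are purely illustrative):
#
#   package myinexport;
#   use parent -norequire, 'inexport';
#
#   sub perform_process_files
#   {
#       my $self = shift(@_);
#       # ... any custom pre-processing of the arguments could go here ...
#       return $self->SUPER::perform_process_files(@_);
#   }
#
#   1;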
1073
1074# @function generate_statistics()
1075sub generate_statistics
1076{
1077  my $self = shift @_;
1078  my ($pluginfo) = @_;
1079
1080  my $inexport_mode = $self->{'mode'};
1081  my $out           = $self->{'out'};
1082  my $faillogname   = $self->{'faillogname'};
1083  my $statsfile     = $self->{'statsfile'};
1084  my $gli           = $self->{'gli'};
1085
1086  &gsprintf($out, "\n");
1087  &gsprintf($out, "*********************************************\n");
1088  &gsprintf($out, "{$inexport_mode.complete}\n");
1089  &gsprintf($out, "*********************************************\n");
1090
1091  &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
1092}
1093# generate_statistics()
1094
1095
1096# @function deinit()
1097# Close down any file handles that we opened (and hence are responsible for
1098# closing).
1099sub deinit
1100{
1101  my $self = shift(@_);
1102  close OUT if $self->{'close_out'};
1103  close FAILLOG if $self->{'close_faillog'};
1104  close STATSFILE if $self->{'close_stats'};
1105}
1106# deinit()
1107
1108
1109sub store_collectionmeta
1110{
1111    my ($collectionmeta,$field,$doc_obj) = @_;
1112   
1113    my $section = $doc_obj->get_top_section();
1114   
1115    my $field_hash = $collectionmeta->{$field};
1116   
1117    foreach my $k (keys %$field_hash)
1118    {
1119    my $val = $field_hash->{$k};
1120   
1121    ### print STDERR "*** $k = $field_hash->{$k}\n";
1122   
1123    my $md_label = "ex.$field";
1124   
1125   
1126    if ($k =~ m/^\[l=(.*?)\]$/)
1127    {
1128       
1129        my $md_suffix = $1;
1130        $md_label .= "^$md_suffix";
1131    }
1132   
1133   
1134    $doc_obj->add_utf8_metadata($section,$md_label, $val);
1135   
1136    # see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
1137    # while "collectionname" in GS2 is called "name" in GS3.
1138    # Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
1139    if (($md_label eq "ex.collectionname^en") || ($md_label eq "ex.collectionname"))
1140    {
1141        $doc_obj->add_utf8_metadata($section,"dc.Title", $val);
1142    }
1143   
1144    }
1145}
1146
1147
1148sub oid_count_file {
1149    my ($archivedir) = @_;
1150    return &FileUtils::filenameConcatenate($archivedir, "OIDcount");
1151}
1152
1153
1154sub prime_doc_oid_count
1155{
1156    my ($archivedir) = @_;
1157    my $oid_count_filename = &oid_count_file($archivedir);
1158
1159    if (-e $oid_count_filename) {
1160    if (open(OIDIN,"<$oid_count_filename")) {
1161        my $OIDcount = <OIDIN>;
1162        chomp $OIDcount;       
1163        close(OIDIN);
1164
1165        $doc::OIDcount = $OIDcount;     
1166    }
1167    else {     
1168        &gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
1169    }
1170    }
1171   
1172}
1173
1174sub store_doc_oid_count
1175{
1176    # Use the file "OIDcount" in the archives directory to record
1177    # what value doc.pm got up to
1178
1179    my ($archivedir) = @_;
1180    my $oid_count_filename = &oid_count_file($archivedir);
1181
1182    # @todo $oidout = &FileUtils::openFileDescriptor($oid_count_filename, 'w') [jmt12]
1183    if (open(OIDOUT,">$oid_count_filename")) {
1184    print OIDOUT $doc::OIDcount, "\n";
1185       
1186    close(OIDOUT);
1187    }
1188    else {
1189    &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
1190    }
1191}
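# For reference (hedged): the OIDcount file written above holds a single
# integer on one line, e.g.
#
#   42
#
# prime_doc_oid_count() reads this value back into $doc::OIDcount so that
# an incremental build continues numbering documents where the previous
# import left off.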
1192
1193
1194
1195sub new_vs_old_import_diff
1196{
1197    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;
1198
1199    # Get the infodbtype value for this collection from the arcinfo object
1200    my $infodbtype = $archive_info->{'infodbtype'};
1201
1202    # in this method, we want to know if metadata files are modified or not.
1203    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
1204
1205    my $archiveinf_timestamp = -M $arcinfo_doc_filename;
1206
1207    # First convert all files to absolute form
1208    # This is to support the situation where the import folder is not
1209    # the default
1210   
1211    my $prev_all_files = $archive_info->{'prev_import_filelist'};
1212    my $full_prev_all_files = {};
1213
1214    foreach my $prev_file (keys %$prev_all_files) {
1215
1216    if (!&FileUtils::isFilenameAbsolute($prev_file)) {
1217        my $full_prev_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$prev_file);
1218        $full_prev_all_files->{$full_prev_file} = $prev_file;
1219    }
1220    else {
1221        $full_prev_all_files->{$prev_file} = $prev_file;
1222    }
1223    }
1224
1225
1226    # Figure out which are the new files and the existing files, and so,
1227    # by implication, the files from the previous import that are no
1228    # longer there => mark them for deletion
1229    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
1230   
1231    my $full_curr_file = $curr_file;
1232
1233    # entry in 'all_files' is moved to either 'existing_files',
1234    # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'
1235
1236    if (!&FileUtils::isFilenameAbsolute($curr_file)) {
1237        # add in import dir to make absolute
1238        $full_curr_file = &FileUtils::filenameConcatenate($importdir,$curr_file);
1239    }
1240
1241    # figure out if new file or not
1242    if (defined $full_prev_all_files->{$full_curr_file}) {
1243        # delete it so that only files that need deleting are left
1244        delete $full_prev_all_files->{$full_curr_file};
1245       
1246        # had it before. is it a metadata file?
1247        if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
1248       
1249        # is it modified??
1250        if (-M $full_curr_file < $archiveinf_timestamp) {
1251            print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
1252            # it's newer than the last build
1253            $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
1254        }
1255        }
1256        else {
1257        if ($incremental_mode eq "all") {
1258           
1259            # had it before
1260            $block_hash->{'existing_files'}->{$full_curr_file} = 1;
1261           
1262        }
1263        else {
1264            # Warning in "onlyadd" mode, but had it before!
1265            print STDERR "Warning: File $full_curr_file previously imported.\n";
1266            print STDERR "         Treating as new file\n";
1267           
1268            $block_hash->{'new_files'}->{$full_curr_file} = 1;
1269           
1270        }
1271        }
1272    }
1273    else {
1274        if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
1275        # the new file is the special sort of file Greenstone uses
1276        # to attach metadata to source documents,
1277        # i.e. metadata.xml
1278        # (but note, the filename used is not constrained in
1279        # Greenstone to always be this)
1280
1281        print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
1282        $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
1283        }
1284        else {
1285        $block_hash->{'new_files'}->{$full_curr_file} = 1;
1286        }
1287    }
1288
1289   
1290    delete $block_hash->{'all_files'}->{$curr_file};
1291    }
1292
1293
1294
1295
1296    # Deal with complication of new or modified metadata files by forcing
1297    # everything from this point down in the file hierarchy to
1298    # be freshly imported. 
1299    #
1300    # This may mean files that have not changed are reindexed, but does
1301    # guarantee by the end of processing all new metadata is correctly
1302    # associated with the relevant document(s).
1303
1304    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
1305    my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");
1306
1307    $situated_dir =~ s/[\\\/]+$//; # remove trailing slashes
1308    $situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression
1309   
1310    # Go through existing_files, and mark anything that is contained
1311    # within 'situated_dir' to be reindexed (in case some of the metadata
1312    # attaches to one of these files)
1313
1314    my $reindex_files = [];
1315
1316    foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {
1317   
1318        if ($existing_f =~ m/^$situated_dir/) {
1319
1320        print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";
1321
1322        push(@$reindex_files,$existing_f);
1323        $block_hash->{'reindex_files'}->{$existing_f} = 1;
1324        delete $block_hash->{'existing_files'}->{$existing_f};
1325
1326        }
1327    }
1328   
1329    # metadata file needs to be in the new_files list so it is parsed by MetadataXMLPlugin
1330    # (or equivalent)
1331    $block_hash->{'new_files'}->{$new_mdf} = 1;
1332
1333    }
1334
1335    # go through remaining existing files and work out what has changed and needs to be reindexed.
1336    my @existing_files = sort keys %{$block_hash->{'existing_files'}};
1337
1338    my $reindex_files = [];
1339
1340    foreach my $existing_filename (@existing_files) {
1341    if (-M $existing_filename < $archiveinf_timestamp) {
1342        # file is newer than last build
1343       
1344        my $existing_file = $existing_filename;
1345        #my $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'});
1346
1347        #my $collectdir_resafe = &util::filename_to_regex($collectdir);
1348        #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;
1349       
1350        print STDERR "**** Reindexing existing file: $existing_file\n";
1351
1352        push(@$reindex_files,$existing_file);
1353        $block_hash->{'reindex_files'}->{$existing_filename} = 1;
1354    }
1355
1356    }
1357
1358   
1359    # By this point full_prev_all_files contains the files
1360    # mentioned in archiveinf-src.db that are not in the 'import'
1361    # folder (or whatever was specified through -importdir ...)
1362
1363    # This list can contain files that were created in the 'tmp' or
1364    # 'cache' areas (such as screen-size and thumbnail images).
1365    #
1366    # In building the final list of files to delete, we test whether each
1367    # file exists on the filesystem; if it does (unusual for a "normal"
1368    # file in import, but possible in the case of 'tmp' files), we
1369    # suppress it from going into the final list
1370
1371    my $collectdir = $ENV{'GSDLCOLLECTDIR'};
1372
1373    my @deleted_files = values %$full_prev_all_files;
1374    map { my $curr_file = $_;
1375      my $full_curr_file = $curr_file;
1376
1377      if (!&FileUtils::isFilenameAbsolute($curr_file)) {
1378          # add in import dir to make absolute
1379
1380          $full_curr_file = &FileUtils::filenameConcatenate($collectdir,$curr_file);
1381      }
1382
1383
1384      if (!-e $full_curr_file) {
1385          $block_hash->{'deleted_files'}->{$curr_file} = 1;
1386      }
1387      } @deleted_files;
1388
1389
1390
1391}
1392
1393
1394# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
1395# $mode is 'delete' or 'reindex'
1396sub mark_docs_for_deletion
1397{
1398    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;
1399
1400    my $mode_text = "deleted from index";
1401    if ($mode eq "reindex") {
1402    $mode_text = "reindexed";
1403    }
1404
1405    # Get the infodbtype value for this collection from the arcinfo object
1406    my $infodbtype = $archive_info->{'infodbtype'};
1407
1408    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
1409    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);
1410
1411
1412    # record files marked for deletion in arcinfo
1413    foreach my $file (@$deleted_files) {
1414    # use 'archiveinf-src' info database file to look up all the OIDs
1415    # that this file is used in (note in most cases, it's just one OID)
1416   
1417    my $relfile = &util::abspath_to_placeholders($file);
1418
1419    my $src_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $relfile);
1420    my $oids = $src_rec->{'oid'};
1421    my $file_record_deleted = 0;
1422
1423    # delete the src record
1424    my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
1425    &dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $relfile);
1426    &dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);
1427
1428
1429    foreach my $oid (@$oids) {
1430
1431        # find the source doc (the primary file that becomes this oid)
1432        my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
1433        my $doc_source_file = $doc_rec->{'src-file'}->[0];
1434        $doc_source_file = &util::placeholders_to_abspath($doc_source_file);
1435
1436        if (!&FileUtils::isFilenameAbsolute($doc_source_file)) {
1437        $doc_source_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
1438        }
1439
1440        if ($doc_source_file ne $file) {
1441        # it's an associated or metadata file
1442
1443        # mark source doc for reimport as one of its assoc files has changed or been deleted
1444        $block_hash->{'reindex_files'}->{$doc_source_file} = 1;
1445       
1446        }
1447        my $curr_status = $archive_info->get_status_info($oid);
1448        if (defined($curr_status) && (($curr_status ne "D"))) {
1449        if ($verbosity>1) {
1450            print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
1451        }
1452        # mark oid for deletion (it will be deleted or reimported)
1453        $archive_info->set_status_info($oid,"D");
1454        my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
1455        $val =~ s/^<index-status>(.*)$/<index-status>D/m;
1456
1457        my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
1458        my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");
1459
1460        &dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
1461        &dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
1462        }
1463    }
1464   
1465    }
1466
1467    # now go through and check that we haven't marked any primary
1468    # files for reindex (because their associated files have been
1469    # changed/deleted) when they have been deleted themselves. Only in
1470    # delete mode.
1471
1472    if ($mode eq "delete") {
1473    foreach my $file (@$deleted_files) {
1474        if (defined $block_hash->{'reindex_files'}->{$file}) {
1475        delete $block_hash->{'reindex_files'}->{$file};
1476        }
1477    }
1478    }
1479
1480
1481}
1482
1483sub add_dir_contents_to_list {
1484
1485    my ($dirname, $list) = @_;
1486 
1487    # Recurse over directory contents.
1488    my (@dir, $subfile);
1489   
1490    # find all the files in the directory
1491    if (!opendir (DIR, $dirname)) {
1492    print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
1493    return -1; # error in processing
1494    }
1495    @dir = readdir (DIR);
1496    closedir (DIR);
1497   
1498    for (my $i = 0; $i < scalar(@dir); $i++) {
1499    my $subfile = $dir[$i];
1500    next if ($subfile =~ m/^\.\.?$/);
1501    next if ($subfile =~ /^\.svn$/);
1502    my $full_file = &FileUtils::filenameConcatenate($dirname, $subfile);
1503    if (-d $full_file) {
1504        &add_dir_contents_to_list($full_file, $list);
1505    } else {
1506        push (@$list, $full_file);
1507    }
1508    }
1509   
1510}
1511
1512   
15131;