source: main/trunk/greenstone2/perllib/inexport.pm@ 31190

Last change on this file since 31190 was 31190, checked in by ak19, 7 years ago

First major commit to do with the new oaiinfo db that keeps track of which oids are deleted. Still need to fix up an issue with the new remove and rename subroutines of dbutil's jdbm not being called to clean up the *.lg log file associated with the main db file. Still need to clean up unused methods in oaiinfo, remove debugging code, and test against the GS3 incr-build-with-manifest tutorial. (The previous 3 commits were separate commits, not all related to this change.)

  • Property svn:executable set to *
File size: 48.1 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use oaiinfo;
38use plugin;
39use plugout;
40use manifest;
41use inexport;
42use util;
43use scriptutil;
44use FileHandle;
45use gsprintf 'gsprintf';
46use printusage;
47use parse2;
48
49use File::Basename;
50
# Enumerated list of the valid OID (document identifier) generation schemes,
# used as the enum 'list' for the -OIDtype option below.  The 'desc' values
# are resource-bundle keys ({...}) resolved to localised strings by gsprintf
# at display time; every scheme's key follows the pattern
# "{import.OIDtype.<name>}", so the table is generated from the names alone.
my $oidtype_list =
    [ map { { 'name' => $_, 'desc' => "{import.OIDtype.$_}" } }
          qw(hash hash_on_full_filename assigned incremental
             filename dirname full_filename) ];

# Argument tables consumed by parse2/PrintUsage.  These are package
# variables (rather than lexicals) so that import.pl and export.pl can
# compose them into their own option lists.
$inexport::directory_arguments =
[
    { 'name'      => "importdir",
      'desc'      => "{import.importdir}",
      'type'      => "string",
      'deft'      => "import",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "collectdir",
      'desc'      => "{import.collectdir}",
      'type'      => "string",
      # parsearg left "" as default
      #'deft' => &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect"),
      'deft'      => "",
      'reqd'      => "no",
      'hiddengli' => "yes" },
];

$inexport::arguments =
[
    # don't set the default to hash - want to allow this to come from
    # entry in collect.cfg but want to override it here
    { 'name'    => "OIDtype",
      'desc'    => "{import.OIDtype}",
      'type'    => "enum",
      'list'    => $oidtype_list,
      'deft'    => "hash_on_full_filename",
      'reqd'    => "no",
      'modegli' => "2" },
    { 'name'    => "OIDmetadata",
      'desc'    => "{import.OIDmetadata}",
      'type'    => "string",
      'deft'    => "dc.Identifier",
      'reqd'    => "no",
      'modegli' => "2" },
    { 'name'      => "site",
      'desc'      => "{import.site}",
      'type'      => "string",
      'deft'      => "",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "manifest",
      'desc'      => "{import.manifest}",
      'type'      => "string",
      'deft'      => "",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "incremental",
      'desc'      => "{import.incremental}",
      'type'      => "flag",
      'hiddengli' => "yes" },
    { 'name'      => "keepold",
      'desc'      => "{import.keepold}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "removeold",
      'desc'      => "{import.removeold}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "language",
      'desc'      => "{scripts.language}",
      'type'      => "string",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'    => "maxdocs",
      'desc'    => "{import.maxdocs}",
      'type'    => "int",
      'reqd'    => "no",
      'deft'    => "-1",
      'range'   => "-1,",
      'modegli' => "1" },
    { 'name'      => "debug",
      'desc'      => "{import.debug}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'    => "faillog",
      'desc'    => "{import.faillog}",
      'type'    => "string",
      # parsearg left "" as default
      #'deft' => &FileUtils::filenameConcatenate("<collectdir>", "colname", "etc", "fail.log"),
      'deft'    => "",
      'reqd'    => "no",
      'modegli' => "3" },
    { 'name'      => "out",
      'desc'      => "{import.out}",
      'type'      => "string",
      'deft'      => "STDERR",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "statsfile",
      'desc'      => "{import.statsfile}",
      'type'      => "string",
      'deft'      => "STDERR",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'    => "verbosity",
      'desc'    => "{import.verbosity}",
      'type'    => "int",
      'range'   => "0,",
      'deft'    => "2",
      'reqd'    => "no",
      'modegli' => "3" },
    { 'name'      => "gli",
      'desc'      => "{scripts.gli}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
    { 'name'      => "xml",
      'desc'      => "{scripts.xml}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
];
184
# Constructor used by import.pl and export.pl.
#
# Parameters:
#   $mode                - "import" or "export"; stored on the object and used
#                          to select behaviour throughout this module
#   $argv                - ref to the script's command-line args; parsed here,
#                          and the single leftover arg is taken as the
#                          collection name
#   $options             - hash whose 'args' entry is the argument table used
#                          for parsing and for printing usage
#   $opt_listall_options - argument table printed when -listall was given
#
# Returns a blessed inexport object.  Dies (after printing usage) on argument
# errors, and dies with a bare "\n" for usage-only exits (-h, -listall, parse
# failure).
185sub new
186{
 187 my $class = shift (@_);
 188 my ($mode,$argv,$options,$opt_listall_options) = @_;
 189
 190 my $self = { 'xml' => 0, 'mode' => $mode };
 191
 192 # general options available to all plugins
 193 my $arguments = $options->{'args'};
 # parse2::parse fills $self with the parsed option values and returns the
 # number of unconsumed args ("allow_extra_options" tolerates unknown flags)
 194 my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
 195 # Parse returns -1 if something has gone wrong
 196 if ($intArgLeftinAfterParsing == -1)
 197 {
 198 &PrintUsage::print_txt_usage($options, "{import.params}",1);
 199 print STDERR "Something went wrong during parsing the arguments. Scroll up for details.\n";
 200 die "\n";
 201 }
 202
 203 my $language = $self->{'language'};
 204 # If $language has been specified, load the appropriate resource bundle
 205 # (Otherwise, the default resource bundle will be loaded automatically)
 206 if ($language && $language =~ /\S/) {
 207 &gsprintf::load_language_specific_resource_bundle($language);
 208 }
 209
 # -listall: print the full option set (XML or text form) and exit
 210 if ($self->{'listall'}) {
 211 if ($self->{'xml'}) {
 212 &PrintUsage::print_xml_usage($opt_listall_options);
 213 }
 214 else
 215 {
 216 &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
 217 }
 218 die "\n";
 219 }
 220
 # -xml: emit machine-readable usage; note this path returns a (mostly
 # uninitialised) object rather than dying
 221 if ($self->{'xml'}) {
 222 &PrintUsage::print_xml_usage($options);
 223 print "\n";
 224 return bless $self, $class;
 225 }
 226
 227 if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
 228 &gsprintf::output_strings_in_UTF8;
 229 }
 230
 231 # If the user specified -h, then we output the usage
 232 if (@$argv && $argv->[0] =~ /^\-+h/) {
 233 &PrintUsage::print_txt_usage($options, "{import.params}");
 234 die "\n";
 235 }
 236 # now check that we had exactly one leftover arg, which should be
 237 # the collection name. We don't want to do this earlier, cos
 238 # -xml arg doesn't need a collection name
 239
 240 if ($intArgLeftinAfterParsing != 1 )
 241 {
 242 &PrintUsage::print_txt_usage($options, "{import.params}", 1);
 243 print STDERR "There should be one argument left after parsing the script args: the collection name.\n";
 244 die "\n";
 245 }
 246
 # If -out names a file (rather than STDERR/STDOUT), open it on the
 # package-level OUT filehandle and store the handle's *name* as a string;
 # the file-level "no strict 'refs'" lets that string be used as a
 # filehandle later (e.g. for the autoflush call below).
 247 $self->{'close_out'} = 0;
 248 my $out = $self->{'out'};
 249 if ($out !~ /^(STDERR|STDOUT)$/i) {
 250 open (OUT, ">$out") ||
 251 (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
 252 $out = 'inexport::OUT';
 253 $self->{'close_out'} = 1;
 254 }
 255 $out->autoflush(1);
 256 $self->{'out'} = $out;
 257
 # Same symbolic-filehandle arrangement for the -statsfile destination.
 # NOTE(review): this sets 'close_stats' where the block above sets
 # 'close_out' — presumably each is checked by the matching close logic
 # elsewhere; confirm before renaming either key.
 258 my $statsfile = $self->{'statsfile'};
 259 if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
 260 open (STATSFILE, ">$statsfile") ||
 261 (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $statsfile) && die);
 262 $statsfile = 'inexport::STATSFILE';
 263 $self->{'close_stats'} = 1;
 264 }
 265 $statsfile->autoflush(1);
 266 $self->{'statsfile'} = $statsfile;
 267
 268 # @ARGV should be only one item, the name of the collection
 269 $self->{'collection'} = shift @$argv;
 270
 271 # Unless otherwise stated all manifests are considered version 1---where
 272 # they act more like an advanced process expression---as compared to newer
 273 # manifest files that act as an explicit (and exhaustive) list of files to
 274 # process [jmt12]
 275 $self->{'manifest_version'} = 1;
 276
 277 return bless $self, $class;
 278}
279
# Simplified constructor for use from CGI scripts: no command-line parsing
# is performed.
#
# Parameters:
#   $mode     - "import" or "export"
#   $collect  - collection name to operate on
#   $gsdl_cgi - optional CGI helper object; when supplied, its
#               get_collection_dir($opt_site) provides the collect directory
#   $opt_site - site name (GS3), passed through to $gsdl_cgi and stored
#
# Output always goes to STDERR, and no fail log is configured.
# Returns a blessed inexport object.
sub newCGI
{
    my ($class, $mode, $collect, $gsdl_cgi, $opt_site) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };
    $self->{'out'} = STDERR;

    if (defined $gsdl_cgi) {
	$self->{'site'}       = $opt_site;
	$self->{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }
    else {
	$self->{'site'}       = "";
	$self->{'collectdir'} = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect");
    }

    $self->{'faillog'}    = "";
    $self->{'collection'} = $collect;

    return bless $self, $class;
}
# Accessor: returns the name of the collection this object operates on
# (set in new() from the leftover command-line arg, or in newCGI()).
sub get_collection
{
    my ($self) = @_;

    return $self->{'collection'};
}
311
312
# Locate and load the collection's configuration file.
#
# Parameters:
#   $collection - collection name (validated/resolved via colcfg)
#   $options    - argument table (only used by the commented-out usage call)
#
# Side effects on $self: sets 'gs_version' ("3" when a site was supplied,
# else "2"), opens the fail log on the package-level FAILLOG handle and
# stores its symbolic name in 'faillog' (with 'faillogname' keeping the real
# path and 'close_faillog' set), and records 'config_filename'.
# Also augments @INC with the collection's own perllib directory.
#
# Returns ($config_filename, $collectcfg).  Dies if the collection cannot be
# used or the fail log cannot be opened.
313sub read_collection_cfg
314{
 315 my $self = shift @_;
 316 my ($collection,$options) = @_;
 317
 318 my $collectdir = $self->{'collectdir'};
 319 my $site = $self->{'site'};
 320 my $out = $self->{'out'};
 321
 # use_collection resolves the collection (and sets up $ENV{'GSDLCOLLECTDIR'}
 # as used below); an empty string means the collection is unusable
 322 if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
 323 #&PrintUsage::print_txt_usage($options, "{import.params}", 1);
 324 die "\n";
 325 }
 326
 327 # set gs_version 2/3
 328 $self->{'gs_version'} = "2";
 329 if ((defined $site) && ($site ne "")) {
 330 # gs3
 331 $self->{'gs_version'} = "3";
 332 }
 333
 334 # add collection's perllib dir into include path in
 335 # case we have collection specific modules
 336 &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib'));
 337
 338 # check that we can open the faillog
 339 my $faillog = $self->{'faillog'};
 340 if ($faillog eq "") {
 341 $faillog = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
 342 }
 343 open (FAILLOG, ">$faillog") ||
 344 (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);
 345
 346
 # From here on $self->{'faillog'} holds the *symbolic name* of the handle
 # ('inexport::FAILLOG'), usable as a filehandle thanks to the file-level
 # "no strict 'refs'"; the actual path is preserved in 'faillogname'.
 347 my $faillogname = $faillog;
 348 $faillog = 'inexport::FAILLOG';
 349 $faillog->autoflush(1);
 350 $self->{'faillog'} = $faillog;
 351 $self->{'faillogname'} = $faillogname;
 352 $self->{'close_faillog'} = 1;
 353
 354 # Read in the collection configuration file.
 355 my $gs_mode = "gs".$self->{'gs_version'}; #gs2 or gs3
 356 my $config_filename = &colcfg::get_collect_cfg_name($out, $gs_mode);
 357
 358 # store the config file's name, so oaiinfo object constructor can be instantiated with it
 359 $self->{'config_filename'} = $config_filename;
 360
 361 my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);
 362
 363 return ($config_filename,$collectcfg);
 364}
365
# Resolve the effective build options for this import/export run by merging
# the command-line values already stored on $self with settings from the
# collection configuration ($collectcfg).  Config values only override a
# command-line option when the corresponding 'default_*' marker is set on
# $self (i.e. the user did not supply the option explicitly).
#
# Side effects: normalises $collectcfg->{'infodbtype'}; makes importdir and
# archivedir absolute, sanitised paths stored back on $self; copies across
# verbosity/manifest/gzip/maxdocs/OIDtype/OIDmetadata/debug/gli settings;
# resolves removeold/keepold/incremental via scriptutil.
# Dies if the import directory does not exist.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $importdir  = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
    my $out        = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'})) {
	$collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
	# we can't use the text version for archives dbs.
	$collectcfg->{'infodbtype'} = "gdbm";
    }

    if (defined $self->{'default_importdir'} && defined $collectcfg->{'importdir'}) {
	$importdir = $collectcfg->{'importdir'};
    }

    if ($inexport_mode eq "import") {
	if (defined $self->{'default_archivedir'} && defined $collectcfg->{'archivedir'}) {
	    $archivedir = $collectcfg->{'archivedir'};
	}
    }
    elsif ($inexport_mode eq "export") {
	if (defined $self->{'default_exportdir'} && defined $collectcfg->{'exportdir'}) {
	    $archivedir = $collectcfg->{'exportdir'};
	}
    }

    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    if (!&FileUtils::isFilenameAbsolute($importdir)) {
	$importdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $importdir);
    }
    else {
	# Don't collapse separators with a raw regex - it kills protocol prefixes.
	# BUGFIX: assign the return value of sanitizePath; the original call
	# discarded it, so an absolute importdir was never actually sanitised
	# (the parallel archivedir branch below already assigns the result).
	$importdir = &FileUtils::sanitizePath($importdir);
    }
    if (!&FileUtils::directoryExists($importdir)) {
	&gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
	die "\n";
    }
    $self->{'importdir'} = $importdir;

    if (!&FileUtils::isFilenameAbsolute($archivedir)) {
	$archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $archivedir);
    }
    else {
	$archivedir = &FileUtils::sanitizePath($archivedir);
    }
    $self->{'archivedir'} = $archivedir;

    if (defined $self->{'default_verbosity'}) {
	if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
	    $self->{'verbosity'} = $collectcfg->{'verbosity'};
	}
    }

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
	$self->{'manifest'} = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
	if ($collectcfg->{'gzip'} =~ /^true$/i) {
	    $self->{'gzip'} = 1;
	}
    }

    if (defined $self->{'default_maxdocs'}) {
	if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
	    $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
	}
    }

    if (defined $self->{'default_OIDtype'}) {
	if (defined $collectcfg->{'OIDtype'}
	    && $collectcfg->{'OIDtype'} =~ /^(hash|hash_on_full_filename|incremental|assigned|filename|dirname|full_filename)$/) {
	    $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
	}
    }

    if (defined $self->{'default_OIDmetadata'}) {
	if (defined $collectcfg->{'OIDmetadata'}) {
	    $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
	}
    }

    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
	$self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
	$self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold against the target directory for this mode
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
	= &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
						   $self->{'incremental'}, $checkdir,
						   $collectcfg);

    $self->{'removeold'}        = $removeold;
    $self->{'keepold'}          = $keepold;
    $self->{'incremental'}      = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;

    # Since this wasted my morning, let's at least warn a user that manifest
    # files now *only* work if keepold is set [jmt12]
    if ($self->{'manifest'} && (!$keepold || !$incremental))
    {
	print STDERR "Warning: -manifest flag should not be specified without also setting -keepold or -incremental\n";
    }
}
496
497sub process_files
498{
499 my $self = shift @_;
500 my ($config_filename,$collectcfg) = @_;
501
502 my $inexport_mode = $self->{'mode'};
503
504 my $verbosity = $self->{'verbosity'};
505 my $debug = $self->{'debug'};
506
507 my $importdir = $self->{'importdir'};
508 my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
509
510 my $incremental = $self->{'incremental'};
511 my $incremental_mode = $self->{'incremental_mode'};
512
513 my $gs_version = $self->{'gs_version'};
514
515 my $removeold = $self->{'removeold'};
516 my $keepold = $self->{'keepold'};
517
518 my $saveas = $self->{'saveas'};
519 my $saveas_options = $self->{'saveas_options'};
520 my $OIDtype = $self->{'OIDtype'};
521 my $OIDmetadata = $self->{'OIDmetadata'};
522
523 my $out = $self->{'out'};
524 my $faillog = $self->{'faillog'};
525
526 my $maxdocs = $self->{'maxdocs'};
527 my $gzip = $self->{'gzip'};
528 my $groupsize = $self->{'groupsize'};
529 my $sortmeta = $self->{'sortmeta'};
530
531 my $removeprefix = $self->{'removeprefix'};
532 my $removesuffix = $self->{'removesuffix'};
533
534 my $gli = $self->{'gli'};
535
536 # related to export
537 my $xsltfile = $self->{'xsltfile'};
538 my $group_marc = $self->{'group_marc'};
539 my $mapping_file = $self->{'mapping_file'};
540 my $xslt_mets = $self->{'xslt_mets'};
541 my $xslt_txt = $self->{'xslt_txt'};
542 my $fedora_namespace = $self->{'fedora_namespace'};
543 my $metadata_prefix = $self->{'metadata_prefix'};
544
545 if ($inexport_mode eq "import") {
546 print STDERR "<Import>\n" if $gli;
547 }
548 else {
549 print STDERR "<export>\n" if $gli;
550 }
551
552 my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
553 if ($self->{'manifest'} ne "") {
554 my $manifest_filename = $self->{'manifest'};
555
556 if (!&FileUtils::isFilenameAbsolute($manifest_filename)) {
557 $manifest_filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
558 }
559 $self->{'manifest'} = &FileUtils::sanitizePath($self->{'manifest'});
560 #$self->{'manifest'} =~ s/[\\\/]+/\//g;
561 #$self->{'manifest'} =~ s/\/$//;
562
563 $manifest_lookup->parse($manifest_filename);
564
565 # manifests may now include a version number [jmt12]
566 $self->{'manifest_version'} = $manifest_lookup->get_version();
567 }
568
569 my $manifest = $self->{'manifest'};
570
571 # load all the plugins
572 my $plugins = [];
573 if (defined $collectcfg->{'plugin'}) {
574 $plugins = $collectcfg->{'plugin'};
575 }
576
577 my $plugin_incr_mode = $incremental_mode;
578 if ($manifest ne "") {
579 # if we have a manifest file, then we pretend we are fully incremental for plugins
580 $plugin_incr_mode = "all";
581 }
582 #some global options for the plugins
583 my @global_opts = ();
584
585 my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode, $gs_version);
586 if (scalar(@$pluginfo) == 0) {
587 &gsprintf($out, "{import.no_plugins_loaded}\n");
588 die "\n";
589 }
590
591 # remove the old contents of the archives directory (and tmp
592 # directory) if needed
593
594 if ($removeold) {
595 if (&FileUtils::directoryExists($archivedir)) {
596 &gsprintf($out, "{import.removing_archives}\n");
597 &FileUtils::removeFilesRecursive($archivedir);
598 }
599 my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
600 $tmpdir =~ s/[\\\/]+/\//g;
601 $tmpdir =~ s/\/$//;
602 if (&FileUtils::directoryExists($tmpdir)) {
603 &gsprintf($out, "{import.removing_tmpdir}\n");
604 &FileUtils::removeFilesRecursive($tmpdir);
605 }
606 }
607
608 # create the archives dir if needed
609 &FileUtils::makeAllDirectories($archivedir);
610
611 # read the archive information file
612
613 # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
614 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-doc"));
615 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-src"));
616
617 # When we make these initial calls to determine the archive information doc
618 # and src databases we pass through a '1' to indicate this is the first
619 # time we are referring to these databases. When using dynamic dbutils
620 # (available in extensions) this indicates to some database types (for
621 # example, persistent servers) that this is a good time to perform any
622 # one time initialization. The argument has no effect on vanilla dbutils
623 # [jmt12]
624 my $perform_firsttime_init = 1;
625 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, $perform_firsttime_init);
626 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, $perform_firsttime_init);
627
628 my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
629 $archive_info->load_info ($arcinfo_doc_filename);
630
631 if ($manifest eq "") {
632 # Load in list of files in import folder from last import (if present)
633 $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
634 }
635
636 ####Use Plugout####
637 my $plugout;
638
639 my $generate_auxiliary_files = 0;
640 if ($inexport_mode eq "import") {
641 $generate_auxiliary_files = 1;
642 }
643 elsif ($self->{'include_auxiliary_database_files'}) {
644 $generate_auxiliary_files = 1;
645 }
646 $self->{'generate_auxiliary_files'} = $generate_auxiliary_files;
647
648 # Option to use user defined plugout
649 if ($inexport_mode eq "import") {
650 if (defined $collectcfg->{'plugout'}) {
651 # If a plugout was specified in the collect.cfg file, assume it is sensible
652 # We can't check the name because it could be anything, if it is a custom plugout
653 print STDERR "Using plugout specified in collect.cfg: ".join(' ', @{$collectcfg->{'plugout'}})."\n";
654 $plugout = $collectcfg->{'plugout'};
655 }
656 else {
657 push @$plugout,$saveas."Plugout";
658 }
659
660 }
661 else {
662 if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(GreenstoneXML|.*METS|DSpace|MARCXML)Plugout/) {
663 $plugout = $collectcfg->{'plugout'};
664 print STDERR "Using plugout specified in collect.cfg: $collectcfg->{'plugout'}\n";
665 }
666 else {
667 push @$plugout,$saveas."Plugout";
668 }
669 }
670
671 my $plugout_name = $plugout->[0];
672
673 if ($inexport_mode eq "export" && defined $saveas_options) {
674 my @user_plugout_options = split(" ", $saveas_options);
675 push @$plugout, @user_plugout_options;
676 }
677 push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
678 push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
679 push @$plugout,("-debug") if ($debug);
680 push @$plugout,("-gzip_output") if ($gzip);
681 push @$plugout,("-output_handle",$out) if (defined $out);
682
683 push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");
684 push @$plugout, ("-no_auxiliary_databases") if ($generate_auxiliary_files == 0);
685 if ($inexport_mode eq "import") {
686 if ($plugout_name =~ m/^GreenstoneXMLPlugout$/) {
687 push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
688 }
689 }
690 my $processor = &plugout::load_plugout($plugout);
691 $processor->setoutputdir ($archivedir);
692 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
693 $processor->set_OIDtype ($OIDtype, $OIDmetadata);
694 $processor->begin();
695 &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
696
697 if ($removeold) {
698 # occasionally, plugins may want to do something on remove
699 # old, eg pharos image indexing
700 &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
701 }
702
703 # process the import directory
704 my $block_hash = {};
705 $block_hash->{'new_files'} = {};
706 $block_hash->{'reindex_files'} = {};
707 # all of these are set somewhere else, so it's more readable to define them
708 # here [jmt12]
709 $block_hash->{'all_files'} = {};
710 $block_hash->{'deleted_files'} = {};
711 $block_hash->{'file_blocks'} = {};
712 $block_hash->{'metadata_files'} = {};
713 $block_hash->{'shared_fileroot'} = '';
714 # a new flag so we can tell we had a manifest way down in the plugins
715 # [jmt12]
716 $block_hash->{'manifest'} = 'false';
717 my $metadata = {};
718
719 # global blocking pass may set up some metadata
720 # does this set up metadata?????
721 # - when we have a newer manifest file we don't do this -unless- the
722 # collection configuration indicates this collection contains complex
723 # (inherited) metadata [jmt12]
724 if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true'))
725 {
726 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
727 }
728 else
729 {
730 print STDERR "Skipping global file scan due to manifest and complexmeta configuration\n";
731 }
732
733
734 # Prepare to work with the <collection>/etc/oai-inf.<db> that keeps track
735 # of the OAI identifiers with their time stamps and deleted status.
736 my $oai_info = new oaiinfo(&util::get_parent_folder($self->{'importdir'}), $collectcfg->{'infodbtype'});
737 my $have_manifest = ($manifest eq '') ? 0 : 1;
738 $oai_info->import_stage($removeold, $have_manifest);
739
740
741 if ($manifest ne "") {
742
743 # mark that we are using a manifest - information that might be needed
744 # down in plugins (for instance DirectoryPlugin)
745 $block_hash->{'manifest'} = $self->{'manifest_version'};
746
747 #
748 # 1. Process delete files first
749 #
750 my @deleted_files = keys %{$manifest_lookup->{'delete'}};
751 my @full_deleted_files = ();
752
753 # ensure all filenames are absolute
754 foreach my $df (@deleted_files) {
755 my $full_df =
756 (&FileUtils::isFilenameAbsolute($df))
757 ? $df
758 : &FileUtils::filenameConcatenate($importdir,$df);
759
760 if (-d $full_df) {
761 &add_dir_contents_to_list($full_df, \@full_deleted_files);
762 } else {
763 push(@full_deleted_files,$full_df);
764 }
765 }
766
767 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
768 mark_docs_for_deletion($archive_info,{},
769 \@full_deleted_files,
770 $archivedir, $verbosity, "delete");
771
772
773 #
774 # 2. Now files for reindexing
775 #
776
777 my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
778 my @full_reindex_files = ();
779 # ensure all filenames are absolute
780 foreach my $rf (@reindex_files) {
781 my $full_rf =
782 (&FileUtils::isFilenameAbsolute($rf))
783 ? $rf
784 : &FileUtils::filenameConcatenate($importdir,$rf);
785
786 if (-d $full_rf) {
787 &add_dir_contents_to_list($full_rf, \@full_reindex_files);
788 } else {
789 push(@full_reindex_files,$full_rf);
790 }
791 }
792
793 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
794 mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
795
796 # And now to ensure the new version of the file processed by
797 # appropriate plugin, we need to add it to block_hash reindex list
798 foreach my $full_rf (@full_reindex_files) {
799 $block_hash->{'reindex_files'}->{$full_rf} = 1;
800 }
801
802
803 #
804 # 3. Now finally any new files - add to block_hash new_files list
805 #
806
807 my @new_files = keys %{$manifest_lookup->{'index'}};
808 my @full_new_files = ();
809
810 foreach my $nf (@new_files) {
811 # ensure filename is absolute
812 my $full_nf =
813 (&FileUtils::isFilenameAbsolute($nf))
814 ? $nf
815 : &FileUtils::filenameConcatenate($importdir,$nf);
816
817 if (-d $full_nf) {
818 &add_dir_contents_to_list($full_nf, \@full_new_files);
819 } else {
820 push(@full_new_files,$full_nf);
821 }
822 }
823
824 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
825 # need to check this file exists before trying to read it - in the past
826 # it wasn't possible to have a manifest unless keepold was also set so
827 # you were pretty much guaranteed arcinfo existed
828 # [jmt12]
829 # @todo &FileUtils::fileExists($arcinfo_src_filename) [jmt12]
830 if (-e $arcinfo_src_filename)
831 {
832 my $arcinfodb_map = {};
833 &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
834 foreach my $f (@full_new_files) {
835 my $rel_f = &util::abspath_to_placeholders($f);
836
837 # check that we haven't seen it already
838 if (defined $arcinfodb_map->{$rel_f}) {
839 # TODO make better warning
840 print STDERR "Warning: $f ($rel_f) already in src archive, \n";
841 } else {
842 $block_hash->{'new_files'}->{$f} = 1;
843 }
844 }
845
846 undef $arcinfodb_map;
847 }
848 # no existing files - so we can just add all the files [jmt12]
849 else
850 {
851 foreach my $f (@full_new_files)
852 {
853 $block_hash->{'new_files'}->{$f} = 1;
854 }
855 }
856
857 # If we are not using complex inherited metadata (and thus have skipped
858 # the global file scan) we need to at least check for a matching
859 # metadata.xml for the files being indexed/reindexed
860 # - unless we are using the newer version of Manifests, which are treated
861 # verbatim, and should have a metadata element for metadata files (so
862 # we can explicitly process metadata files other than metadata.xml)
863 # [jmt12]
864 if ($self->{'manifest_version'} == 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true'))
865 {
866 my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
867 foreach my $file_to_import (@all_files_to_import)
868 {
869 my $metadata_xml_path = $file_to_import;
870 $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/;
871 if (&FileUtils::fileExists($metadata_xml_path))
872 {
873 &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli);
874 }
875 }
876 }
877
878 # new version manifest files explicitly list metadata files to be
879 # processed (ignoring complexmeta if set)
880 # [jmt12]
881 if ($self->{'manifest_version'} > 1)
882 {
883 # Process metadata files
884 foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}})
885 {
886 $self->perform_process_files($manifest, $pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs);
887 }
888 }
889 } # end if (manifest ne "")
890 else {
891 # if incremental, we read through the import folder to see whats changed.
892
893 if ($incremental || $incremental_mode eq "onlyadd") {
894 prime_doc_oid_count($archivedir);
895
896 # Can now work out which files were new, already existed, and have
897 # been deleted
898
899 new_vs_old_import_diff($archive_info,$block_hash,$importdir,
900 $archivedir,$verbosity,$incremental_mode);
901
902 my @new_files = sort keys %{$block_hash->{'new_files'}};
903 if (scalar(@new_files>0)) {
904 print STDERR "New files and modified metadata files since last import:\n ";
905 print STDERR join("\n ",@new_files), "\n";
906 }
907
908 if ($incremental) {
909 # only look for deletions if we are truely incremental
910 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
911 # Filter out any in gsdl/tmp area
912 my @filtered_deleted_files = ();
913 my $gsdl_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
914 my $collect_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
915 $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
916 $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
917
918 foreach my $df (@deleted_files) {
919 next if ($df =~ m/^$gsdl_tmp_area/);
920 next if ($df =~ m/^$collect_tmp_area/);
921
922 push(@filtered_deleted_files,$df);
923 }
924
925
926 @deleted_files = @filtered_deleted_files;
927
928 if (scalar(@deleted_files)>0) {
929 print STDERR "Files deleted since last import:\n ";
930 print STDERR join("\n ",@deleted_files), "\n";
931
932
933 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
934
935 mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
936 }
937
938 my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
939
940 if (scalar(@reindex_files)>0) {
941 print STDERR "Files to reindex since last import:\n ";
942 print STDERR join("\n ",@reindex_files), "\n";
943 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
944 mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
945 }
946
947 }
948 } # end if incremental/only_add mode
949 # else no manifest AND not incremental
950 } # end if else block of manifest ne "" else eq ""
951
952 # Check for existence of the file that's to contain earliestDateStamp in archivesdir
953 # Do nothing if the file already exists (file exists on incremental build).
954 # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
955 # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
956 # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
957 # oailastmodified and oailastmodifieddate
958 my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp");
959 if ($self->{'generate_auxiliary_files'}) {
960 if (!-f $earliestDatestampFile && -d $archivedir) {
961 my $current_time_in_seconds = time; # in seconds
962
963 if(open(FOUT, ">$earliestDatestampFile")) {
964 # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
965 print FOUT $current_time_in_seconds;
966 close(FOUT);
967 }
968 else {
969 &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
970 }
971
972 }
973 }
974
975 $self->perform_process_files($manifest, $pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs);
976
977 if ($saveas eq "FedoraMETS") {
978 # create collection "doc obj" for Fedora that contains
979 # collection-level metadata
980
981 my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
982 $doc_obj->set_OID("collection");
983
984 my $col_name = undef;
985 my $col_meta = $collectcfg->{'collectionmeta'};
986
987 if (defined $col_meta) {
988 store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
989 store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
990 }
991 $processor->process($doc_obj);
992 }
993
994 &plugin::end($pluginfo, $processor);
995
996 &plugin::deinit($pluginfo, $processor);
997
998 # Store the value of OIDCount (used in doc.pm) so it can be
999 # restored correctly to this value on an incremental build
1000 # - this OIDcount file should only be generated for numerical oids [jmt12]
1001 if ($self->{'OIDtype'} eq 'incremental')
1002 {
1003 store_doc_oid_count($archivedir);
1004 }
1005
1006 # signal to the processor (plugout) that we have finished processing - if we are group processing, then the final output file needs closing.
1007 $processor->close_group_output() if $processor->is_group();
1008
1009# if ($inexport_mode eq "import") {
1010 if ($self->{'generate_auxiliary_files'}) {
1011 # write out the archive information file
1012 # for backwards compatability with archvies.inf file
1013 if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
1014 $archive_info->save_info($arcinfo_doc_filename);
1015 }
1016 else {
1017 $archive_info->save_revinfo_db($arcinfo_src_filename);
1018 }
1019 }
1020 return $pluginfo;
1021}
1022
# @function perform_process_files()
# While process_files() prepares the system to import files, this is the
# function that actually kicks off the plugin pipeline over those files.
# Subclasses of inexport.pm can therefore override this method should they
# wish to do different or further processing.
# @param $manifest        path of the manifest file ('' if none supplied)
# @param $pluginfo        initialised plugin list, handed to &plugin::read
# @param $importdir       import directory for a whole-folder scan
# @param $file_to_import  a single file nominated by a version 2+ manifest
#                         ('' when doing a global scan)
# @author jmt12
sub perform_process_files
{
    my $self = shift(@_);
    my ($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs) = @_;

    my $gli = $self->{'gli'};

    # Case 1: a specific file to process - via manifest version 2+
    if ($file_to_import ne '')
    {
        &plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
        return;
    }

    # Case 2: global file scan. If we are using a new-version manifest, files
    # would have been read above. Older manifests use extra settings in the
    # $block_hash to control what is imported, while non-manifest imports use
    # a regular $block_hash (so obeying process_exp and block_exp) [jmt12]
    if ($manifest eq '' || $self->{'manifest_version'} == 1)
    {
        &plugin::read ($pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
        return;
    }

    # Case 3: a version 2+ manifest is present but named no file - nothing to do.
    print STDERR "Skipping perform_process_files() due to manifest presence and version\n";
}
# perform_process_files()
1053
# @function generate_statistics()
# Print the "import/export complete" banner to the output stream and ask the
# plugin layer to write out its accumulated statistics.
# @param $pluginfo  initialised plugin list, handed on to &plugin::write_stats
sub generate_statistics
{
    my ($self, $pluginfo) = @_;

    # Pull the relevant fields off the object in one hash slice
    my ($inexport_mode, $out, $faillogname, $statsfile, $gli)
        = @{$self}{qw(mode out faillogname statsfile gli)};

    # Completion banner (resource key depends on import vs export mode)
    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "{$inexport_mode.complete}\n");
    &gsprintf($out, "*********************************************\n");

    # Delegate the per-plugin statistics (and failure log) to the plugin layer
    &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
}
# generate_statistics()
1074
1075
# @function deinit()
# Close down any file handles that we opened (and hence are responsible for
# closing). Each handle is only closed when the matching 'close_*' flag was
# set at the time the handle was opened.
sub deinit
{
    my $self = shift(@_);

    if ($self->{'close_out'}) {
        close OUT;
    }
    if ($self->{'close_faillog'}) {
        close FAILLOG;
    }
    if ($self->{'close_statsfile'}) {
        close STATSFILE;
    }
}
# deinit()
1087
1088
# Copy one collection-level metadata field (e.g. "collectionname",
# "collectionextra") out of the collect.cfg metadata structure and onto the
# top section of the given document object as "ex." metadata, preserving any
# per-language variants expressed as "[l=xx]" keys.
# @param $collectionmeta  hashref of field => { lang-key => value }
# @param $field           the collectionmeta field name to copy
# @param $doc_obj         document object receiving the metadata
sub store_collectionmeta
{
    my ($collectionmeta, $field, $doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $lang_map = $collectionmeta->{$field};

    foreach my $lang_key (keys %$lang_map)
    {
        my $value = $lang_map->{$lang_key};

        # Base label; a "[l=xx]" key adds a "^xx" language suffix
        my $md_label = "ex.$field";
        if ($lang_key =~ m/^\[l=(.*?)\]$/)
        {
            $md_label .= "^$1";
        }

        $doc_obj->add_utf8_metadata($top_section, $md_label, $value);

        # see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
        # while "collectionname" in GS2 is called "name" in GS3.
        # Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3.
        # The (English or language-neutral) collection name also doubles as dc.Title.
        if (($md_label eq "ex.collectionname^en") || ($md_label eq "ex.collectionname"))
        {
            $doc_obj->add_utf8_metadata($top_section, "dc.Title", $value);
        }
    }
}
1126
1127
# Absolute path of the "OIDcount" bookkeeping file kept inside the archives
# directory (records the highest numeric OID handed out by doc.pm).
sub oid_count_file {
    my ($archivedir) = @_;

    return &FileUtils::filenameConcatenate($archivedir, "OIDcount");
}
1132
1133
# Restore doc.pm's OID counter from the "OIDcount" file in the archives
# directory, so an incremental build continues numbering where the previous
# build left off. Silently does nothing if no such file exists (full build);
# reports via gsprintf (but does not die) if the file exists but can't be read.
# @param $archivedir  the collection's archives directory
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # No file on a full/first build - leave doc.pm's counter at its default
    return unless -e $oid_count_filename;

    # Three-arg open with a lexical handle (was: bareword OIDIN, 2-arg open)
    if (open(my $oid_in, '<', $oid_count_filename)) {
        my $OIDcount = <$oid_in>;
        close($oid_in);

        # Guard against an empty/truncated file: <$oid_in> returns undef,
        # which previously triggered a chomp warning and primed the counter
        # with undef
        if (defined $OIDcount) {
            chomp $OIDcount;
            $doc::OIDcount = $OIDcount;
        }
    }
    else {
        &gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
    }
}
1153
sub store_doc_oid_count
{
    # Use the file "OIDcount" in the archives directory to record
    # what value doc.pm got up to, so the next incremental build can
    # resume numbering from there (see prime_doc_oid_count).

    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # Three-arg open with a lexical handle (was: bareword OIDOUT, 2-arg open)
    # @todo $oidout = &FileUtils::openFileDescriptor($oid_count_filename, 'w') [jmt12]
    if (open(my $oid_out, '>', $oid_count_filename)) {
        print {$oid_out} $doc::OIDcount, "\n";

        # Buffered write errors (e.g. disk full) only surface at close, so
        # check it and report through the same resource-bundle message
        if (!close($oid_out)) {
            &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
        }
    }
    else {
        &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
1172
1173
1174
# Compare the current import area against the file list recorded at the last
# import, partitioning every entry of $block_hash->{'all_files'} into one of:
# 'new_files', 'existing_files', 'new_or_modified_metadata_files',
# 'reindex_files' or (via the leftovers of the previous list) 'deleted_files'.
# Modification is judged by comparing each file's -M age against the age of
# the archiveinf-doc database, i.e. the time of the last build.
# @param $archive_info      arcinfo object (supplies infodbtype and the
#                           previous import file list)
# @param $block_hash        working hash of file classifications (mutated)
# @param $importdir         import directory, used to absolutise relative paths
# @param $archivedir        archives directory holding the infodb files
# @param $verbosity         >= 2 enables per-file diagnostics on STDERR
# @param $incremental_mode  "all" or "onlyadd" (affects previously-seen files)
sub new_vs_old_import_diff
{
    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    # -M gives age in days; smaller value == more recently modified. A file
    # with -M smaller than this timestamp changed after the last build.
    my $archiveinf_timestamp = -M $arcinfo_doc_filename;

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default

    my $prev_all_files = $archive_info->{'prev_import_filelist'};
    my $full_prev_all_files = {};

    # Map: absolute path => path as recorded in the previous import list
    foreach my $prev_file (keys %$prev_all_files) {

        if (!&FileUtils::isFilenameAbsolute($prev_file)) {
            my $full_prev_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$prev_file);
            $full_prev_all_files->{$full_prev_file} = $prev_file;
        }
        else {
            $full_prev_all_files->{$prev_file} = $prev_file;
        }
    }


    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

        my $full_curr_file = $curr_file;

        # entry in 'all_files' is moved to either 'existing_files',
        # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

        if (!&FileUtils::isFilenameAbsolute($curr_file)) {
            # add in import dir to make absolute
            $full_curr_file = &FileUtils::filenameConcatenate($importdir,$curr_file);
        }

        # figure out if new file or not
        if (defined $full_prev_all_files->{$full_curr_file}) {
            # delete it so that only files that need deleting are left
            delete $full_prev_all_files->{$full_curr_file};

            # had it before. is it a metadata file?
            if ($block_hash->{'metadata_files'}->{$full_curr_file}) {

                # is it modified??
                if (-M $full_curr_file < $archiveinf_timestamp) {
                    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
                    # its newer than last build
                    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
                }
            }
            else {
                if ($incremental_mode eq "all") {

                    # had it before
                    $block_hash->{'existing_files'}->{$full_curr_file} = 1;

                }
                else {
                    # Warning in "onlyadd" mode, but had it before!
                    print STDERR "Warning: File $full_curr_file previously imported.\n";
                    print STDERR " Treating as new file\n";

                    $block_hash->{'new_files'}->{$full_curr_file} = 1;

                }
            }
        }
        else {
            if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
                # the new file is the special sort of file greenstone uses
                # to attach metadata to src documents
                # i.e metadata.xml
                # (but note, the filename used is not constrained in
                # Greenstone to always be this)

                print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
                $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
            }
            else {
                $block_hash->{'new_files'}->{$full_curr_file} = 1;
            }
        }


        delete $block_hash->{'all_files'}->{$curr_file};
    }




    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
        my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

        $situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
        $situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression

        # Go through existing_files, and mark anything that is contained
        # within 'situated_dir' to be reindexed (in case some of the metadata
        # attaches to one of these files)

        # NOTE(review): this @$reindex_files accumulator is populated but never
        # read afterwards; the real bookkeeping happens in
        # $block_hash->{'reindex_files'} below
        my $reindex_files = [];

        foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

            if ($existing_f =~ m/^$situated_dir/) {

#                print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

                push(@$reindex_files,$existing_f);
                $block_hash->{'reindex_files'}->{$existing_f} = 1;
                delete $block_hash->{'existing_files'}->{$existing_f};

            }
        }

        # metadata file needs to be in new_files list so parsed by MetadataXMLPlug
        # (or equivalent)
        $block_hash->{'new_files'}->{$new_mdf} = 1;

    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    # NOTE(review): as above, this accumulator is write-only
    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
        if (-M $existing_filename < $archiveinf_timestamp) {
            # file is newer than last build

            my $existing_file = $existing_filename;
            #my $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'});

            #my $collectdir_resafe = &util::filename_to_regex($collectdir);
            #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;

            print STDERR "**** Reindexing existing file: $existing_file\n";

            push(@$reindex_files,$existing_file);
            $block_hash->{'reindex_files'}->{$existing_filename} = 1;
        }

    }


    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # supress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    my @deleted_files = values %$full_prev_all_files;
    # NOTE(review): map in void context, used purely for its side effect of
    # filling $block_hash->{'deleted_files'} - a foreach would read better
    map { my $curr_file = $_;
          my $full_curr_file = $curr_file;

          if (!&FileUtils::isFilenameAbsolute($curr_file)) {
              # add in import dir to make absolute

              $full_curr_file = &FileUtils::filenameConcatenate($collectdir,$curr_file);
          }


          if (!-e $full_curr_file) {
              # only files that are genuinely gone from disk are flagged
              $block_hash->{'deleted_files'}->{$curr_file} = 1;
          }
        } @deleted_files;



}
1372
1373
# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
# $mode is 'delete' or 'reindex'
#
# For each file in @$deleted_files: look up (via archiveinf-src) every OID the
# file contributed to, remove the file's src record, and mark each affected OID
# with index-status "D" in archiveinf-doc so buildcol.pl deletes/reimports it.
# If the file was merely an associated/metadata file of a document, that
# document's primary source file is queued in $block_hash->{'reindex_files'}.
# @param $archive_info   arcinfo object (infodbtype + per-OID status info)
# @param $block_hash     working hash; 'reindex_files' may be mutated
# @param $deleted_files  arrayref of files being deleted or superseded
# @param $archivedir     archives directory holding the infodb files
# @param $verbosity      >1 prints a line per marked OID
# @param $mode           'delete' or 'reindex' (affects message text and the
#                        final reindex-list cleanup)
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
        $mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
        # use 'archiveinf-src' info database file to look up all the OIDs
        # that this file is used in (note in most cases, it's just one OID)

        # db keys use placeholder paths, not machine-specific absolute paths
        my $relfile = &util::abspath_to_placeholders($file);

        my $src_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $relfile);
        my $oids = $src_rec->{'oid'};
        # NOTE(review): $file_record_deleted is never used below - leftover?
        my $file_record_deleted = 0;

        # delete the src record
        my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
        &dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $relfile);
        &dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);


        foreach my $oid (@$oids) {

            # find the source doc (the primary file that becomes this oid)
            my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
            my $doc_source_file = $doc_rec->{'src-file'}->[0];
            $doc_source_file = &util::placeholders_to_abspath($doc_source_file);

            if (!&FileUtils::isFilenameAbsolute($doc_source_file)) {
                $doc_source_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
            }

            if ($doc_source_file ne $file) {
                # its an associated or metadata file

                # mark source doc for reimport as one of its assoc files has changed or deleted
                $block_hash->{'reindex_files'}->{$doc_source_file} = 1;

            }
            my $curr_status = $archive_info->get_status_info($oid);
            if (defined($curr_status) && (($curr_status ne "D"))) {
                if ($verbosity>1) {
                    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
                }
                # mark oid for deletion (it will be deleted or reimported)
                $archive_info->set_status_info($oid,"D");
                # rewrite only the <index-status> line of the raw db record
                my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
                $val =~ s/^<index-status>(.*)$/<index-status>D/m;

                my $val_rec = &dbutil::convert_infodb_string_to_hash($infodbtype,$val);
                my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

                &dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
                &dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
            }
        }

    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.

    if ($mode eq "delete") {
        foreach my $file (@$deleted_files) {
            if (defined $block_hash->{'reindex_files'}->{$file}) {
                delete $block_hash->{'reindex_files'}->{$file};
            }
        }
    }


}
1462
# Recursively collect every plain file underneath $dirname into @$list,
# skipping "." / ".." and any ".svn" bookkeeping entries.
# @param $dirname  directory to walk
# @param $list     arrayref that receives the full path of each file found
# @return -1 if $dirname could not be opened (a warning is printed),
#         0 otherwise (errors in subdirectories are warned about but, as
#         before, do not propagate upward)
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # find all the files in the directory
    # (lexical dir handle; was a bareword DIR handle)
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
        print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
        return -1; # error in processing
    }
    my @entries = readdir ($dir_handle);
    closedir ($dir_handle);

    # direct iteration (was a C-style index loop with a dead outer $subfile)
    foreach my $subfile (@entries) {
        next if ($subfile =~ m/^\.\.?$/);
        next if ($subfile =~ /^\.svn$/);
        my $full_file = &FileUtils::filenameConcatenate($dirname, $subfile);
        if (-d $full_file) {
            &add_dir_contents_to_list($full_file, $list);
        } else {
            push (@$list, $full_file);
        }
    }

    return 0; # explicit success (previously fell off the end)
}
1491
1492
14931;
Note: See TracBrowser for help on using the repository browser.