source: gs2-extensions/parallel-building/trunk/src/perllib/inexport.pm@ 30354

Last change on this file since 30354 was 30354, checked in by jmt12, 8 years ago

Extending manifest v2 support to allow directories to be listed in the manifest. Matched with changes in the Directory plugin to allow paths into systems like HDFS to be listed in the manifest.

  • Property svn:executable set to *
File size: 48.7 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use plugin;
38use plugout;
39use manifest;
40use inexport;
41use util;
42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
47
48use File::Basename;
49use Scalar::Util 'blessed';
50
# Allowed values for the OIDtype option. Every entry is a hash with the
# value's 'name' and a brace-delimited 'desc' key of the form
# {import.OIDtype.<name>}. The order below is the order of the list.
my $oidtype_list = [
    map { +{ 'name' => $_, 'desc' => '{import.OIDtype.' . $_ . '}' } }
        qw(hash
           hash_on_full_filename
           assigned
           incremental
           filename
           dirname
           full_filename)
];
66
# Option descriptors for the two directory arguments (importdir and
# collectdir). Both are optional strings flagged 'hiddengli'.
$inexport::directory_arguments = [
    {
        name      => 'importdir',
        desc      => '{import.importdir}',
        type      => 'string',
        reqd      => 'no',
        deft      => 'import',
        hiddengli => 'yes',
    },
    {
        name      => 'collectdir',
        desc      => '{import.collectdir}',
        type      => 'string',
        # The default is deliberately left empty here rather than being
        # computed as <GSDLHOME>/collect.
        deft      => '',
        reqd      => 'no',
        hiddengli => 'yes',
    },
];
# Option descriptors shared by the import and export scripts. Each entry
# gives the option name, its description key, its type and (where present)
# a default, a permitted range and a GLI visibility/mode marker.
$inexport::arguments = [
    # OIDtype carries an explicit default here; set_collection_options may
    # still replace it with the value from the collection configuration.
    {
        name    => 'OIDtype',
        desc    => '{import.OIDtype}',
        type    => 'enum',
        list    => $oidtype_list,
        deft    => 'hash_on_full_filename',
        reqd    => 'no',
        modegli => '2',
    },
    {
        name    => 'OIDmetadata',
        desc    => '{import.OIDmetadata}',
        type    => 'string',
        deft    => 'dc.Identifier',
        reqd    => 'no',
        modegli => '2',
    },
    {
        name      => 'site',
        desc      => '{import.site}',
        type      => 'string',
        deft      => '',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        name      => 'manifest',
        desc      => '{import.manifest}',
        type      => 'string',
        deft      => '',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        name      => 'incremental',
        desc      => '{import.incremental}',
        type      => 'flag',
        hiddengli => 'yes',
    },
    {
        name      => 'keepold',
        desc      => '{import.keepold}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        name      => 'removeold',
        desc      => '{import.removeold}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        name      => 'language',
        desc      => '{scripts.language}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        name    => 'maxdocs',
        desc    => '{import.maxdocs}',
        type    => 'int',
        reqd    => 'no',
        deft    => '-1',
        range   => '-1,',
        modegli => '1',
    },
    {
        name      => 'debug',
        desc      => '{import.debug}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        name    => 'faillog',
        desc    => '{import.faillog}',
        type    => 'string',
        # The default is left empty here; read_collection_cfg substitutes
        # <GSDLCOLLECTDIR>/etc/fail.log when no value was supplied.
        deft    => '',
        reqd    => 'no',
        modegli => '3',
    },
    {
        name      => 'out',
        desc      => '{import.out}',
        type      => 'string',
        deft      => 'STDERR',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        name      => 'statsfile',
        desc      => '{import.statsfile}',
        type      => 'string',
        deft      => 'STDERR',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        name    => 'verbosity',
        desc    => '{import.verbosity}',
        type    => 'int',
        range   => '0,',
        deft    => '2',
        reqd    => 'no',
        modegli => '3',
    },
    {
        name      => 'gli',
        desc      => '{scripts.gli}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {
        name      => 'xml',
        desc      => '{scripts.xml}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
];
184
# Constructor used by the command-line import/export scripts.
#
# Parameters:
#   $mode                - stored as $self->{'mode'}; other methods compare it
#                          against "import" and "export".
#   $argv                - array reference of command-line arguments; parsed
#                          options are written into the new object and the
#                          collection name is shifted off what remains.
#   $options             - option description hash; its 'args' entry is handed
#                          to the argument parser and the whole hash to the
#                          usage printers.
#   $opt_listall_options - option description used for usage output when the
#                          'listall' option is set.
#
# Returns the blessed object. When 'xml' is set the XML usage is printed and
# the object is returned early, before any output handle is opened.
#
# Dies (after printing a message or usage text) when argument parsing fails,
# when 'listall' or -h is requested, when the number of leftover arguments is
# not exactly one, or when an output file cannot be opened.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{import.params}",1);
        print STDERR "Something went wrong during parsing the arguments. Scroll up for details.\n";
        die "\n";
    }

    if ($self->{'verbosity'} > 2) {
        print "[INFO] This inexport.pm supports version 2 manifest files\n";
    }
    if ($self->{'verbosity'} > 3) {
        print '[DEBUG] Perl @INC: ' . join(", ", @INC) . "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($self->{'listall'}) {
        if ($self->{'xml'}) {
            &PrintUsage::print_xml_usage($opt_listall_options);
        }
        else
        {
            &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
        }
        die "\n";
    }

    if ($self->{'xml'}) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # If the user specified -h, then we output the usage
    if (@$argv && $argv->[0] =~ /^\-+h/) {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Now check that we had exactly one leftover arg, which should be the
    # collection name. This is not done earlier because the -xml option
    # does not need a collection name.
    if ($intArgLeftinAfterParsing != 1 )
    {
        &PrintUsage::print_txt_usage($options, "{import.params}", 1);
        print STDERR "There should be one argument left after parsing the script args: the collection name.\n";
        die "\n";
    }

    # The 'out' and 'statsfile' options are either the name of a standard
    # handle or a path to write to. In both cases the value stored back on
    # the object is a handle *name* (a string) that is used symbolically.
    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out =~ /^(STDERR|STDOUT)$/i) {
        # The match is case-insensitive, so normalise to the real handle name
        # to make sure the symbolic handle lookup below resolves.
        $out = uc($out);
    }
    else {
        # Three-argument open: the path is never interpreted as a mode or a
        # pipe, and surrounding whitespace in the name is preserved.
        if (!open (OUT, '>', $out)) {
            # Abort regardless of what gsprintf returns
            &gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out);
            die;
        }
        $out = 'inexport::OUT';
        $self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # Initialised here so the flag is always defined, matching 'close_out'
    $self->{'close_stats'} = 0;
    my $statsfile = $self->{'statsfile'};
    if ($statsfile =~ /^(STDERR|STDOUT)$/i) {
        $statsfile = uc($statsfile);
    }
    else {
        if (!open (STATSFILE, '>', $statsfile)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $statsfile);
            die;
        }
        $statsfile = 'inexport::STATSFILE';
        $self->{'close_stats'} = 1;
    }
    $statsfile->autoflush(1);
    $self->{'statsfile'} = $statsfile;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # Unless otherwise stated all manifests are considered version 1---where
    # they act more like an advanced process expression---as compared to newer
    # manifest files that act as an explicit (and exhaustive) list of files to
    # process [jmt12]
    $self->{'manifest_version'} = 1;

    return bless $self, $class;
}
286
# Simplified version of the constructor for use with CGI scripts.
#
# Parameters:
#   $mode     - stored as the object's 'mode'.
#   $collect  - stored as the object's 'collection'.
#   $gsdl_cgi - optional object; when defined, its get_collection_dir method
#               is asked for the collect directory of $opt_site.
#   $opt_site - site name, only used when $gsdl_cgi is defined.
#
# No argument parsing is done and no files are opened. Returns the blessed
# object.
sub newCGI
{
    my ($class, $mode, $collect, $gsdl_cgi, $opt_site) = @_;

    # Work out the site and collect directory first
    my $site;
    my $collectdir;
    if (!defined $gsdl_cgi) {
        # No CGI helper: fall back to <GSDLHOME>/collect and an empty site
        $site       = "";
        $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect");
    }
    else {
        $site       = $opt_site;
        $collectdir = $gsdl_cgi->get_collection_dir($opt_site);
    }

    my $self = {
        'xml'  => 0,
        'mode' => $mode,
    };
    $self->{'out'}        = STDERR;
    $self->{'site'}       = $site;
    $self->{'collectdir'} = $collectdir;
    $self->{'faillog'}    = "";
    $self->{'collection'} = $collect;

    return bless $self, $class;
}
# Accessor: returns the value stored under the object's 'collection' key.
sub get_collection
{
    my ($self) = @_;
    return $self->{'collection'};
}
318
319
# Select the collection, open the fail log and read the collection
# configuration file.
#
# Parameters:
#   $collection - collection name passed to colcfg::use_collection.
#   $options    - accepted for interface compatibility; not used by the
#                 current implementation.
#
# Side effects on $self: sets 'gs_version' ("3" when a non-empty site is
# configured, otherwise "2"), replaces 'faillog' with the name of the opened
# handle, and sets 'faillogname' and 'close_faillog'.
#
# Returns the list ($config_filename, $collectcfg).
#
# Dies if the collection cannot be selected or the fail log cannot be opened.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
        #&PrintUsage::print_txt_usage($options, "{import.params}", 1);
        die "\n";
    }

    # set gs_version 2/3
    $self->{'gs_version'} = "2";
    if ((defined $site) && ($site ne "")) {
        # gs3
        $self->{'gs_version'} = "3";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib'));

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
        $faillog = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # Three-argument open so the path is never interpreted as a mode or pipe.
    if (!open (FAILLOG, '>', $faillog)) {
        # Abort regardless of what gsprintf returns
        &gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog);
        die;
    }

    # From here on 'faillog' holds the handle name (used symbolically) and
    # 'faillogname' keeps the path that was opened.
    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;
    $self->{'close_faillog'} = 1;

    # Read in the collection configuration file.
    my $gs_mode = "gs".$self->{'gs_version'}; #gs2 or gs3
    my $config_filename = &colcfg::get_collect_cfg_name($out, $gs_mode);
    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
368
# Merge settings from the collection configuration into this object's
# options and resolve the import and archive/export directories.
#
# Parameters:
#   $collectcfg - hash reference of collection configuration settings. Its
#                 'infodbtype' entry is filled in with the default when
#                 missing, and "gdbm-txtgz" is replaced by "gdbm".
#
# For importdir, archivedir/exportdir, verbosity, maxdocs, OIDtype and
# OIDmetadata the configuration value is only used when the matching
# 'default_<option>' key is defined on $self (presumably set by the argument
# parser when the option was not given explicitly - confirm against parse2).
#
# Side effects on $self: updates 'importdir', 'archivedir', 'verbosity',
# 'manifest', 'gzip', 'maxdocs', 'OIDtype', 'OIDmetadata', 'debug', 'gli',
# 'removeold', 'keepold', 'incremental' and 'incremental_mode', and resets
# 'directoryplugin' to 0.
#
# Dies if the resolved import directory does not exist.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
    my $out = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
        $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
        # we can't use the text version for archives dbs.
        $collectcfg->{'infodbtype'} = "gdbm";
    }

    if (defined $self->{'default_importdir'} && defined $collectcfg->{'importdir'}) {
        $importdir = $collectcfg->{'importdir'};
    }

    if ($inexport_mode eq "import") {
        if ( defined $self->{'default_archivedir'} && defined $collectcfg->{'archivedir'}) {
            $archivedir = $collectcfg->{'archivedir'};
        }
    }
    elsif ($inexport_mode eq "export") {
        if (defined $self->{'default_exportdir'} && defined $collectcfg->{'exportdir'}) {
            $archivedir = $collectcfg->{'exportdir'};
        }
    }

    # Relative directories are resolved against the collection directory;
    # absolute ones are sanitised.
    if (!&FileUtils::isFilenameAbsolute($importdir))
    {
        $importdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $importdir);
    }
    else
    {
        # Plain slash-collapsing regexes are not used here because they
        # would damage protocol prefixes. The sanitised path must be
        # assigned back: calling sanitizePath without using its return
        # value left $importdir unchanged.
        $importdir = &FileUtils::sanitizePath($importdir);
    }
    if (!&FileUtils::directoryExists($importdir))
    {
        &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
        die "\n";
    }
    $self->{'importdir'} = $importdir;

    if (!&FileUtils::isFilenameAbsolute($archivedir)) {
        $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $archivedir);
    }
    else {
        $archivedir = &FileUtils::sanitizePath($archivedir);
    }
    $self->{'archivedir'} = $archivedir;

    if (defined $self->{'default_verbosity'}) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $self->{'verbosity'} = $collectcfg->{'verbosity'};
        }
    }

    # A manifest named in the configuration is only used when none was given
    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
        $self->{'manifest'} = $collectcfg->{'manifest'};
    }

    # The configuration can switch gzip on, but never off
    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $self->{'gzip'} = 1;
        }
    }

    if (defined $self->{'default_maxdocs'}) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
        }
    }

    if (defined $self->{'default_OIDtype'} ) {
        if (defined $collectcfg->{'OIDtype'}
            && $collectcfg->{'OIDtype'} =~ /^(hash|hash_on_full_filename|incremental|assigned|filename|dirname|full_filename)$/) {
            $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
        }
    }

    if (defined $self->{'default_OIDmetadata'}) {
        if (defined $collectcfg->{'OIDmetadata'}) {
            $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
        }
    }

    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
                                                   $self->{'incremental'}, $checkdir,
                                                   $collectcfg);

    $self->{'removeold'} = $removeold;
    $self->{'keepold'} = $keepold;
    $self->{'incremental'} = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;

    # We'll need direct access to this plugin to support v2 manifests
    $self->{'directoryplugin'} = 0;
}
495
496sub process_files
497{
498 my $self = shift @_;
499 my ($config_filename,$collectcfg) = @_;
500
501 my $inexport_mode = $self->{'mode'};
502
503 my $verbosity = $self->{'verbosity'};
504 my $debug = $self->{'debug'};
505
506 my $importdir = $self->{'importdir'};
507 my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
508
509 my $incremental = $self->{'incremental'};
510 my $incremental_mode = $self->{'incremental_mode'};
511
512 my $gs_version = $self->{'gs_version'};
513
514 my $removeold = $self->{'removeold'};
515 my $keepold = $self->{'keepold'};
516
517 my $saveas = $self->{'saveas'};
518 my $saveas_options = $self->{'saveas_options'};
519 my $OIDtype = $self->{'OIDtype'};
520 my $OIDmetadata = $self->{'OIDmetadata'};
521
522 my $out = $self->{'out'};
523 my $faillog = $self->{'faillog'};
524
525 my $maxdocs = $self->{'maxdocs'};
526 my $gzip = $self->{'gzip'};
527 my $groupsize = $self->{'groupsize'};
528 my $sortmeta = $self->{'sortmeta'};
529
530 my $removeprefix = $self->{'removeprefix'};
531 my $removesuffix = $self->{'removesuffix'};
532
533 my $gli = $self->{'gli'};
534
535 # related to export
536 my $xsltfile = $self->{'xsltfile'};
537 my $group_marc = $self->{'group_marc'};
538 my $mapping_file = $self->{'mapping_file'};
539 my $xslt_mets = $self->{'xslt_mets'};
540 my $xslt_txt = $self->{'xslt_txt'};
541 my $fedora_namespace = $self->{'fedora_namespace'};
542 my $metadata_prefix = $self->{'metadata_prefix'};
543
544 if ($inexport_mode eq "import") {
545 print STDERR "<Import>\n" if $gli;
546 }
547 else {
548 print STDERR "<export>\n" if $gli;
549 }
550
551 my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
552 if ($self->{'manifest'} ne "") {
553 my $manifest_filename = $self->{'manifest'};
554
555 if (!&FileUtils::isFilenameAbsolute($manifest_filename)) {
556 $manifest_filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
557 }
558 $self->{'manifest'} = &FileUtils::sanitizePath($self->{'manifest'});
559 #$self->{'manifest'} =~ s/[\\\/]+/\//g;
560 #$self->{'manifest'} =~ s/\/$//;
561
562 $manifest_lookup->parse($manifest_filename);
563
564 # manifests may now include a version number [jmt12]
565 $self->{'manifest_version'} = $manifest_lookup->get_version();
566 }
567
568 my $manifest = $self->{'manifest'};
569
570 # load all the plugins
571 my $plugins = [];
572 if (defined $collectcfg->{'plugin'}) {
573 $plugins = $collectcfg->{'plugin'};
574 }
575
576 my $plugin_incr_mode = $incremental_mode;
577 if ($manifest ne "") {
578 # if we have a manifest file, then we pretend we are fully incremental for plugins
579 $plugin_incr_mode = "all";
580 }
581 #some global options for the plugins
582 my @global_opts = ();
583
584 my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode, $gs_version);
585 if (scalar(@$pluginfo) == 0) {
586 &gsprintf($out, "{import.no_plugins_loaded}\n");
587 die "\n";
588 }
589 # Store a reference to the DirectoryPlugin
590 foreach my $a_plugin (@{$pluginfo})
591 {
592 if (blessed ($a_plugin) eq 'DirectoryPlugin')
593 {
594 $self->{'directoryplugin'} = $a_plugin;
595 }
596 }
597 # No directory plugin - no v2 manifest support
598 if ($self->{'directoryplugin'} == 0)
599 {
600 print STDERR "WARNING: DirectoryPlugin not loaded: metadata.xml files not supported.\n";
601 }
602
603 # remove the old contents of the archives directory (and tmp
604 # directory) if needed
605
606 if ($removeold) {
607 if (&FileUtils::directoryExists($archivedir)) {
608 &gsprintf($out, "{import.removing_archives}\n");
609 &FileUtils::removeFilesRecursive($archivedir);
610 }
611 my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
612 $tmpdir =~ s/[\\\/]+/\//g;
613 $tmpdir =~ s/\/$//;
614 if (&FileUtils::directoryExists($tmpdir)) {
615 &gsprintf($out, "{import.removing_tmpdir}\n");
616 &FileUtils::removeFilesRecursive($tmpdir);
617 }
618 }
619
620 # create the archives dir if needed
621 &FileUtils::makeAllDirectories($archivedir);
622
623 # read the archive information file
624
625 # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
626 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-doc"));
627 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-src"));
628
629 # When we make these initial calls to determine the archive information doc
630 # and src databases we pass through a '1' to indicate this is the first
631 # time we are referring to these databases. When using dynamic dbutils
632 # (available in extensions) this indicates to some database types (for
633 # example, persistent servers) that this is a good time to perform any
634 # one time initialization. The argument has no effect on vanilla dbutils
635 # [jmt12]
636 my $perform_firsttime_init = 1;
637 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, $perform_firsttime_init);
638 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, $perform_firsttime_init);
639
640 my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
641 $archive_info->load_info ($arcinfo_doc_filename);
642
643 if ($manifest eq "") {
644 # Load in list of files in import folder from last import (if present)
645 $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
646 }
647
648 ####Use Plugout####
649 my $plugout;
650
651 my $generate_auxiliary_files = 0;
652 if ($inexport_mode eq "import") {
653 $generate_auxiliary_files = 1;
654 }
655 elsif ($self->{'include_auxiliary_database_files'}) {
656 $generate_auxiliary_files = 1;
657 }
658 $self->{'generate_auxiliary_files'} = $generate_auxiliary_files;
659
660 # Option to use user defined plugout
661 if ($inexport_mode eq "import") {
662 if (defined $collectcfg->{'plugout'}) {
663 # If a plugout was specified in the collect.cfg file, assume it is sensible
664 # We can't check the name because it could be anything, if it is a custom plugout
665 print STDERR "Using plugout specified in collect.cfg: ".join(' ', @{$collectcfg->{'plugout'}})."\n";
666 $plugout = $collectcfg->{'plugout'};
667 }
668 else {
669 push @$plugout,$saveas."Plugout";
670 }
671
672 }
673 else {
674 if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(GreenstoneXML|.*METS|DSpace|MARCXML)Plugout/) {
675 $plugout = $collectcfg->{'plugout'};
676 print STDERR "Using plugout specified in collect.cfg: $collectcfg->{'plugout'}\n";
677 }
678 else {
679 push @$plugout,$saveas."Plugout";
680 }
681 }
682
683 my $plugout_name = $plugout->[0];
684
685 if ($inexport_mode eq "export" && defined $saveas_options) {
686 my @user_plugout_options = split(" ", $saveas_options);
687 push @$plugout, @user_plugout_options;
688 }
689 push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
690 push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
691 push @$plugout,("-debug") if ($debug);
692 push @$plugout,("-gzip_output") if ($gzip);
693 push @$plugout,("-output_handle",$out) if (defined $out);
694
695 push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");
696 push @$plugout, ("-no_auxiliary_databases") if ($generate_auxiliary_files == 0);
697 if ($inexport_mode eq "import") {
698 if ($plugout_name =~ m/^GreenstoneXMLPlugout$/) {
699 push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
700 }
701 }
702 my $processor = &plugout::load_plugout($plugout);
703 $processor->setoutputdir ($archivedir);
704 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
705 $processor->set_OIDtype ($OIDtype, $OIDmetadata);
706 $processor->begin();
707 &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
708
709 if ($removeold) {
710 # occasionally, plugins may want to do something on remove
711 # old, eg pharos image indexing
712 &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
713 }
714
715 # process the import directory
716 my $block_hash = {};
717 $block_hash->{'new_files'} = {};
718 $block_hash->{'reindex_files'} = {};
719 # all of these are set somewhere else, so it's more readable to define them
720 # here [jmt12]
721 $block_hash->{'all_files'} = {};
722 $block_hash->{'deleted_files'} = {};
723 $block_hash->{'file_blocks'} = {};
724 $block_hash->{'metadata_files'} = {};
725 $block_hash->{'shared_fileroot'} = '';
726 # a new flag so we can tell we had a manifest way down in the plugins
727 # [jmt12]
728 $block_hash->{'manifest'} = 'false';
729 my $metadata = {};
730
731 # global blocking pass may set up some metadata
732 # does this set up metadata?????
733 # - when we have a newer manifest file we don't do this -unless- the
734 # collection configuration indicates this collection contains complex
735 # (inherited) metadata [jmt12]
736 if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true'))
737 {
738 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
739 }
740 else
741 {
742 print STDERR "Skipping import directory-level global file scan due to manifest and complexmeta configuration\n";
743 }
744
745 if ($manifest ne "") {
746
747 # mark that we are using a manifest - information that might be needed
748 # down in plugins (for instance DirectoryPlugin)
749 $block_hash->{'manifest'} = $self->{'manifest_version'};
750
751 #
752 # 1. Process delete files first
753 #
754 my @deleted_files = keys %{$manifest_lookup->{'delete'}};
755 my @full_deleted_files = ();
756
757 # ensure all filenames are absolute
758 foreach my $df (@deleted_files) {
759 my $full_df =
760 (&FileUtils::isFilenameAbsolute($df))
761 ? $df
762 : &FileUtils::filenameConcatenate($importdir,$df);
763
764 if (-d $full_df && $self->{'manifest_version'} != 2) {
765 &add_dir_contents_to_list($full_df, \@full_deleted_files);
766 } else {
767 push(@full_deleted_files,$full_df);
768 }
769 }
770
771 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
772 mark_docs_for_deletion($archive_info,{},
773 \@full_deleted_files,
774 $archivedir, $verbosity, "delete");
775
776
777 #
778 # 2. Now files for reindexing
779 #
780
781 my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
782 my @full_reindex_files = ();
783 # ensure all filenames are absolute
784 foreach my $rf (@reindex_files) {
785 my $full_rf =
786 (&FileUtils::isFilenameAbsolute($rf))
787 ? $rf
788 : &FileUtils::filenameConcatenate($importdir,$rf);
789
790 if (-d $full_rf && $self->{'manifest_version'} != 2) {
791 &add_dir_contents_to_list($full_rf, \@full_reindex_files);
792 } else {
793 push(@full_reindex_files,$full_rf);
794 }
795 }
796
797 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
798 mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
799
800 # And now to ensure the new version of the file processed by
801 # appropriate plugin, we need to add it to block_hash reindex list
802 foreach my $full_rf (@full_reindex_files) {
803 $block_hash->{'reindex_files'}->{$full_rf} = 1;
804 }
805
806
807 #
808 # 3. Now finally any new files - add to block_hash new_files list
809 #
810
811 my @new_files = keys %{$manifest_lookup->{'index'}};
812 my @full_new_files = ();
813
814 foreach my $nf (@new_files) {
815 # ensure filename is absolute
816 my $full_nf =
817 (&FileUtils::isFilenameAbsolute($nf))
818 ? $nf
819 : &FileUtils::filenameConcatenate($importdir,$nf);
820
821 if (-d $full_nf && $self->{'manifest_version'} != 2) {
822 &add_dir_contents_to_list($full_nf, \@full_new_files);
823 } else {
824 push(@full_new_files,$full_nf);
825 }
826 }
827
828 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
829 # need to check this file exists before trying to read it - in the past
830 # it wasn't possible to have a manifest unless keepold was also set so
831 # you were pretty much guaranteed arcinfo existed
832 # [jmt12]
833 # @todo &FileUtils::fileExists($arcinfo_src_filename) [jmt12]
834 if (-e $arcinfo_src_filename)
835 {
836 my $arcinfodb_map = {};
837 &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
838 foreach my $f (@full_new_files) {
839 my $rel_f = &util::abspath_to_placeholders($f);
840
841 # check that we haven't seen it already
842 if (defined $arcinfodb_map->{$rel_f}) {
843 # TODO make better warning
844 print STDERR "Warning: $f ($rel_f) already in src archive, \n";
845 } else {
846 $block_hash->{'new_files'}->{$f} = 1;
847 }
848 }
849
850 undef $arcinfodb_map;
851 }
852 # no existing files - so we can just add all the files [jmt12]
853 else
854 {
855 foreach my $f (@full_new_files)
856 {
857 $block_hash->{'new_files'}->{$f} = 1;
858 }
859 }
860
861 # If we are not using complex inherited metadata (and thus have skipped
862 # the global file scan) we need to at least check for a matching
863 # metadata.xml for the files being indexed/reindexed
864 # - unless we are using the newer version of Manifests, which are treated
865 # verbatim, and should have a metadata element for metadata files (so
866 # we can explicitly process metadata files other than metadata.xml)
867 # [jmt12]
868 if ($self->{'manifest_version'} == 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true'))
869 {
870 my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
871 foreach my $file_to_import (@all_files_to_import)
872 {
873 my $metadata_xml_path = $file_to_import;
874 $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/;
875 if (&FileUtils::fileExists($metadata_xml_path))
876 {
877 &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli);
878 }
879 }
880 }
881
882 # new version manifest files explicitly list files to be processed and
883 # only support 'simplemeta' format (ignoring complexmeta if set) in that
884 # each document can be accompanied by a metadata.xml file in the same
885 # directory. The metadata.xml can only apply to the fileset ".*".
886 # [jmt12]
887 if ($self->{'manifest_version'} > 1)
888 {
889 # Process metadata files
890 foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}})
891 {
892 if (&FileUtils::directoryExists($file_to_import)) {
893# print "DEBUG: Directory to import: \"" . $file_to_import . "\"\n";
894 &plugin::file_block_read($pluginfo, '', $file_to_import, $block_hash, $metadata, $gli);
895# print "\n===== BLOCK HASH =====\n";
896# Dump($block_hash);
897# print "\n===== =====\n\n";
898 $self->perform_process_files($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs);
899 }
900 else
901 {
902# print "DEBUG: File to import: \"" . $file_to_import . "\"\n";
903 $self->{'directoryplugin'}->read_for_manifest_v2($pluginfo, $file_to_import, $block_hash, $processor, $gli);
904 }
905 }
906 }
907 }
908 else {
909 # if incremental, we read through the import folder to see whats changed.
910
911 if ($incremental || $incremental_mode eq "onlyadd") {
912 prime_doc_oid_count($archivedir);
913
914 # Can now work out which files were new, already existed, and have
915 # been deleted
916
917 new_vs_old_import_diff($archive_info,$block_hash,$importdir,
918 $archivedir,$verbosity,$incremental_mode);
919
920 my @new_files = sort keys %{$block_hash->{'new_files'}};
921 if (scalar(@new_files>0)) {
922 print STDERR "New files and modified metadata files since last import:\n ";
923 print STDERR join("\n ",@new_files), "\n";
924 }
925
926 if ($incremental) {
927 # only look for deletions if we are truely incremental
928 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
929 # Filter out any in gsdl/tmp area
930 my @filtered_deleted_files = ();
931 my $gsdl_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
932 my $collect_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
933 $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
934 $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
935
936 foreach my $df (@deleted_files) {
937 next if ($df =~ m/^$gsdl_tmp_area/);
938 next if ($df =~ m/^$collect_tmp_area/);
939
940 push(@filtered_deleted_files,$df);
941 }
942
943
944 @deleted_files = @filtered_deleted_files;
945
946 if (scalar(@deleted_files)>0) {
947 print STDERR "Files deleted since last import:\n ";
948 print STDERR join("\n ",@deleted_files), "\n";
949
950
951 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
952
953 mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
954 }
955
956 my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
957
958 if (scalar(@reindex_files)>0) {
959 print STDERR "Files to reindex since last import:\n ";
960 print STDERR join("\n ",@reindex_files), "\n";
961 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
962 mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
963 }
964
965 }
966 }
967 }
968
969 # Check for existence of the file that's to contain earliestDateStamp in archivesdir
970 # Do nothing if the file already exists (file exists on incremental build).
971 # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
972 # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
973 # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
974 # oailastmodified and oailastmodifieddate
975 my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp");
976 if ($self->{'generate_auxiliary_files'}) {
977 if (!-f $earliestDatestampFile && -d $archivedir) {
978 my $current_time_in_seconds = time; # in seconds
979
980 if(open(FOUT, ">$earliestDatestampFile")) {
981 # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
982 print FOUT $current_time_in_seconds;
983 close(FOUT);
984 }
985 else {
986 &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
987 }
988
989 }
990 }
991
992 if ($self->{'manifest_version'} != 2)
993 {
994 $self->perform_process_files($manifest, $pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs);
995 }
996
997 if ($saveas eq "FedoraMETS") {
998 # create collection "doc obj" for Fedora that contains
999 # collection-level metadata
1000
1001 my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
1002 $doc_obj->set_OID("collection");
1003
1004 my $col_name = undef;
1005 my $col_meta = $collectcfg->{'collectionmeta'};
1006
1007 if (defined $col_meta) {
1008 store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
1009 store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
1010 }
1011 $processor->process($doc_obj);
1012 }
1013
1014 &plugin::end($pluginfo, $processor);
1015
1016 &plugin::deinit($pluginfo, $processor);
1017
1018 # Store the value of OIDCount (used in doc.pm) so it can be
1019 # restored correctly to this value on an incremental build
1020 # - this OIDcount file should only be generated for numerical oids [jmt12]
1021 if ($self->{'OIDtype'} eq 'incremental')
1022 {
1023 store_doc_oid_count($archivedir);
1024 }
1025
1026 # signal to the processor (plugout) that we have finished processing - if we are group processing, then the final output file needs closing.
1027 $processor->close_group_output() if $processor->is_group();
1028
1029# if ($inexport_mode eq "import") {
1030 if ($self->{'generate_auxiliary_files'}) {
1031 # write out the archive information file
1032 # for backwards compatability with archvies.inf file
1033 if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
1034 $archive_info->save_info($arcinfo_doc_filename);
1035 }
1036 else {
1037 $archive_info->save_revinfo_db($arcinfo_src_filename);
1038 }
1039 }
1040 return $pluginfo;
1041}
1042
# @function perform_process_files()
# While process_files() above prepares the system to import files, this is
# the function that actually initiates the plugin pipeline to process them.
# This function can therefore be overridden in subclasses of inexport.pm
# should they wish to do different or further processing.
#
# Exactly one of three things happens:
#  - $file_to_import is non-empty: that single path is handed to
#    plugin::read() (manifest version 2+ behaviour);
#  - otherwise, if $manifest is '' or the manifest version is 1: $importdir
#    is handed to plugin::read() for a global scan;
#  - otherwise a "Skipping" notice is printed to STDERR.
# @author jmt12
sub perform_process_files
{
    my ($self, $manifest, $pluginfo, $importdir, $file_to_import,
        $block_hash, $metadata, $processor, $maxdocs) = @_;
    my $gli = $self->{'gli'};

    # specific file to process - via manifest version 2+
    if ($file_to_import ne '') {
        return &plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }

    # global file scan - if we are using a new version manifest, files would
    # have been read already. Older manifests use extra settings in the
    # $block_hash to control what is imported, while non-manifest imports use
    # a regular $block_hash (so obeying process_exp and block_exp) [jmt12]
    if ($manifest eq '' || $self->{'manifest_version'} == 1) {
        return &plugin::read ($pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }

    return print STDERR "Skipping perform_process_files() due to manifest presence and version\n";
}
1072# perform_process_files()
1073
# @function generate_statistics()
# Prints a "{<mode>.complete}" banner (framed by two rows of asterisks) to the
# output handle stored in $self->{'out'} via gsprintf(), then asks the plugins
# to write their statistics through plugin::write_stats().
# @param $pluginfo  passed through untouched to plugin::write_stats()
sub generate_statistics
{
    my ($self, $pluginfo) = @_;

    my $mode         = $self->{'mode'};
    my $out_handle   = $self->{'out'};
    my $faillog_name = $self->{'faillogname'};
    my $stats_file   = $self->{'statsfile'};
    my $gli_flag     = $self->{'gli'};

    my $rule = "*********************************************\n";
    foreach my $message ("\n", $rule, "{$mode.complete}\n", $rule) {
        &gsprintf($out_handle, $message);
    }

    return &plugin::write_stats($pluginfo, $stats_file, $faillog_name, $gli_flag);
}
1093# generate_statistics()
1094
1095
# @function deinit()
# Close down any file handles that we opened (and hence are responsible for
# closing). Each of the OUT, FAILLOG and STATSFILE handles is closed only
# when the matching 'close_out', 'close_faillog' or 'close_statsfile' flag
# is set on the object.
sub deinit
{
    my ($self) = @_;

    if ($self->{'close_out'}) {
        close(OUT);
    }
    if ($self->{'close_faillog'}) {
        close(FAILLOG);
    }
    if ($self->{'close_statsfile'}) {
        close(STATSFILE);
    }
}
1106# deinit()
1107
1108
# Copies every value stored under $collectionmeta->{$field} onto the top
# section of $doc_obj as "ex.<field>" metadata.
#
# A key of the form "[l=xx]" is treated as a language marker: its value is
# stored under "ex.<field>^xx". Any other key stores its value under plain
# "ex.<field>".
#
# Keys are visited in sorted order so that the order in which metadata values
# are added (and hence the order of repeated dc.Title values) is the same on
# every run rather than following Perl's randomised hash order.
#
# @param $collectionmeta  hash ref: field name => { key => value }
# @param $field           name of the field to copy (e.g. "collectionname")
# @param $doc_obj         object providing get_top_section() and
#                         add_utf8_metadata()
sub store_collectionmeta
{
    my ($collectionmeta, $field, $doc_obj) = @_;

    my $section = $doc_obj->get_top_section();

    my $field_hash = $collectionmeta->{$field};

    # Nothing recorded for this field: there is nothing to copy
    return unless (defined $field_hash);

    foreach my $k (sort keys %$field_hash)
    {
        my $val = $field_hash->{$k};

        ### print STDERR "*** $k = $field_hash->{$k}\n";

        my $md_label = "ex.$field";

        if ($k =~ m/^\[l=(.*?)\]$/)
        {
            my $md_suffix = $1;
            $md_label .= "^$md_suffix";
        }

        $doc_obj->add_utf8_metadata($section, $md_label, $val);

        # see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
        # while "collectionname" in GS2 is called "name" in GS3.
        # Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
        if (($md_label eq "ex.collectionname^en") || ($md_label eq "ex.collectionname"))
        {
            $doc_obj->add_utf8_metadata($section, "dc.Title", $val);
        }
    }
}
1146
1147
# Returns the path of the "OIDcount" file inside the given archives
# directory, as built by FileUtils::filenameConcatenate().
sub oid_count_file
{
    my $archivedir = $_[0];
    my $counter_basename = "OIDcount";

    return &FileUtils::filenameConcatenate($archivedir, $counter_basename);
}
1152
1153
# Restores doc.pm's OID counter ($doc::OIDcount) from the first line of the
# "OIDcount" file in the given archives directory.
#
# - If the file does not exist, nothing happens.
# - If it exists but cannot be opened, the {import.cannot_read_OIDcount}
#   message is printed to STDERR.
# - If it exists but is empty, $doc::OIDcount is left unchanged rather than
#   being overwritten with undef.
#
# @param $archivedir  the archives directory holding the OIDcount file
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # No counter file: nothing to restore
    return unless (-e $oid_count_filename);

    # Three-argument open with a lexical handle: the filename is never
    # interpreted as a mode, and no package-global handle is left behind
    my $oid_in;
    if (!open($oid_in, '<', $oid_count_filename)) {
        &gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
        return;
    }

    my $OIDcount = <$oid_in>;
    close($oid_in);

    # readline yields undef for an empty file; keep the current counter then
    if (defined $OIDcount) {
        chomp($OIDcount);
        $doc::OIDcount = $OIDcount;
    }

    return;
}
1173
# Uses the file "OIDcount" in the archives directory to record what value
# doc.pm got up to ($doc::OIDcount), followed by a newline.
#
# If the file cannot be opened, or the write or the close fails, the
# {import.cannot_write_OIDcount} message is printed to STDERR.
#
# @param $archivedir  the archives directory to hold the OIDcount file
sub store_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # @todo $oidout = &FileUtils::openFileDescriptor($oid_count_filename, 'w') [jmt12]
    my $stored = 0;
    my $oid_out;
    if (open($oid_out, '>', $oid_count_filename)) {
        my $print_ok = print {$oid_out} ($doc::OIDcount, "\n");
        # Buffered output may only fail when the handle is closed, so the
        # result of close() matters as much as that of print()
        my $close_ok = close($oid_out);
        $stored = ($print_ok && $close_ok) ? 1 : 0;
    }

    if (!$stored) {
        &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }

    return;
}
1192
1193
1194
# Compares the files currently listed in $block_hash->{'all_files'} against
# the list recorded by the previous import
# ($archive_info->{'prev_import_filelist'}) and sorts every file into one of
# the following $block_hash tables (keyed by filename, value 1):
#   'existing_files'                  seen before (only in "all" mode)
#   'new_files'                       not seen before, plus every new or
#                                     modified metadata file
#   'new_or_modified_metadata_files'  metadata files that are new, or newer
#                                     than the archiveinf-doc database
#   'reindex_files'                   existing files that are newer than the
#                                     archiveinf-doc database, or that live
#                                     under the directory of a new/modified
#                                     metadata file
#   'deleted_files'                   previously imported files that no
#                                     longer exist on disk
# Entries are removed from 'all_files' as they are classified.
#
# @param $archive_info      reads 'infodbtype' and 'prev_import_filelist'
# @param $block_hash        reads 'all_files' and 'metadata_files'; updated
#                           as described above
# @param $importdir         prefix used to make relative current files absolute
# @param $archivedir        where the archiveinf-doc database is looked up
# @param $verbosity         metadata-file detection is reported when >= 2
# @param $incremental_mode  "all" keeps previously seen files as existing;
#                           any other value treats them as new (with a warning)
sub new_vs_old_import_diff
{
    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    # -M gives a file's age, so a smaller value means more recently modified
    my $archiveinf_timestamp = -M $arcinfo_doc_filename;

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default

    my $prev_all_files = $archive_info->{'prev_import_filelist'};
    my $full_prev_all_files = {};

    foreach my $prev_file (keys %$prev_all_files) {
        if (!&FileUtils::isFilenameAbsolute($prev_file)) {
            my $full_prev_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$prev_file);
            $full_prev_all_files->{$full_prev_file} = $prev_file;
        }
        else {
            $full_prev_all_files->{$prev_file} = $prev_file;
        }
    }

    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

        my $full_curr_file = $curr_file;

        # entry in 'all_files' is moved to either 'existing_files',
        # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

        if (!&FileUtils::isFilenameAbsolute($curr_file)) {
            # add in import dir to make absolute
            $full_curr_file = &FileUtils::filenameConcatenate($importdir,$curr_file);
        }

        # figure out if new file or not
        if (defined $full_prev_all_files->{$full_curr_file}) {
            # delete it so that only files that need deleting are left
            delete $full_prev_all_files->{$full_curr_file};

            # had it before. is it a metadata file?
            if ($block_hash->{'metadata_files'}->{$full_curr_file}) {

                # is it modified??
                if (-M $full_curr_file < $archiveinf_timestamp) {
                    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
                    # its newer than last build
                    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
                }
            }
            else {
                if ($incremental_mode eq "all") {
                    # had it before
                    $block_hash->{'existing_files'}->{$full_curr_file} = 1;
                }
                else {
                    # Warning in "onlyadd" mode, but had it before!
                    print STDERR "Warning: File $full_curr_file previously imported.\n";
                    print STDERR " Treating as new file\n";

                    $block_hash->{'new_files'}->{$full_curr_file} = 1;
                }
            }
        }
        else {
            if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
                # the new file is the special sort of file greenstone uses
                # to attach metadata to src documents
                # i.e metadata.xml
                # (but note, the filename used is not constrained in
                # Greenstone to always be this)

                print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
                $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
            }
            else {
                $block_hash->{'new_files'}->{$full_curr_file} = 1;
            }
        }

        delete $block_hash->{'all_files'}->{$curr_file};
    }

    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
        my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

        $situated_dir =~ s/[\\\/]+$//; # remove trailing slashes
        $situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression

        # A file only lives within 'situated_dir' if the directory name is
        # followed by a path separator. Without that requirement a metadata
        # file in ".../a" would also drag in the files of a sibling
        # directory such as ".../ab".
        my $within_dir_re = qr/^(?:$situated_dir)[\\\/]/;

        # Go through existing_files, and mark anything that is contained
        # within 'situated_dir' to be reindexed (in case some of the metadata
        # attaches to one of these files)

        foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

            if ($existing_f =~ $within_dir_re) {

                print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

                $block_hash->{'reindex_files'}->{$existing_f} = 1;
                delete $block_hash->{'existing_files'}->{$existing_f};
            }
        }

        # metadata file needs to be in new_files list so parsed by MetadataXMLPlug
        # (or equivalent)
        $block_hash->{'new_files'}->{$new_mdf} = 1;
    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    foreach my $existing_filename (@existing_files) {
        if (-M $existing_filename < $archiveinf_timestamp) {
            # file is newer than last build
            print STDERR "**** Reindexing existing file: $existing_filename\n";

            $block_hash->{'reindex_files'}->{$existing_filename} = 1;
        }
    }

    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # suppress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    foreach my $curr_file (values %$full_prev_all_files) {
        my $full_curr_file = $curr_file;

        if (!&FileUtils::isFilenameAbsolute($curr_file)) {
            # add in collect dir to make absolute
            $full_curr_file = &FileUtils::filenameConcatenate($collectdir,$curr_file);
        }

        if (!-e $full_curr_file) {
            $block_hash->{'deleted_files'}->{$curr_file} = 1;
        }
    }

    return;
}
1392
1393
# This is used to delete "deleted" docs, and to remove old versions of
# "changed" docs. $mode is 'delete' or 'reindex'.
#
# For every file in @$deleted_files its record is removed from the
# archiveinf-src database, and every OID that record listed is given status
# "D", both in $archive_info and in the archiveinf-doc database (unless the
# OID has no status or is already "D"). Whenever the file is not the primary
# source file of one of those OIDs, that primary source file is added to
# $block_hash->{'reindex_files'}.
sub mark_docs_for_deletion
{
    my ($archive_info, $block_hash, $deleted_files, $archivedir, $verbosity, $mode) = @_;

    # Only used in the progress message below
    my $mode_text = ($mode eq "reindex") ? "reindexed" : "deleted from index";

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $doc_db_path = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $src_db_path = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);

    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
        # use 'archiveinf-src' info database file to look up all the OIDs
        # that this file is used in (note in most cases, it's just one OID)
        my $relfile = &util::abspath_to_placeholders($file);

        my $src_rec = &dbutil::read_infodb_entry($infodbtype, $src_db_path, $relfile);
        my $oids = $src_rec->{'oid'};

        # delete the src record
        my $src_handle = &dbutil::open_infodb_write_handle($infodbtype, $src_db_path, "append");
        &dbutil::delete_infodb_entry($infodbtype, $src_handle, $relfile);
        &dbutil::close_infodb_write_handle($infodbtype, $src_handle);

        foreach my $oid (@$oids) {
            # find the source doc (the primary file that becomes this oid)
            my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $doc_db_path, $oid);
            my $doc_source_file = $doc_rec->{'src-file'}->[0];
            $doc_source_file = &util::placeholders_to_abspath($doc_source_file);

            if (!&FileUtils::isFilenameAbsolute($doc_source_file)) {
                $doc_source_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $doc_source_file);
            }

            if ($doc_source_file ne $file) {
                # its an associated or metadata file, so mark the source doc
                # for reimport as one of its assoc files has changed or deleted
                $block_hash->{'reindex_files'}->{$doc_source_file} = 1;
            }

            # nothing more to do for OIDs without a status or already marked
            my $curr_status = $archive_info->get_status_info($oid);
            next if (!defined($curr_status) || ($curr_status eq "D"));

            if ($verbosity > 1) {
                print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
            }

            # mark oid for deletion (it will be deleted or reimported)
            $archive_info->set_status_info($oid, "D");
            my $raw_entry = &dbutil::read_infodb_rawentry($infodbtype, $doc_db_path, $oid);
            $raw_entry =~ s/^<index-status>(.*)$/<index-status>D/m;

            my $updated_rec = &dbutil::convert_infodb_string_to_hash($raw_entry);
            my $doc_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db_path, "append");

            &dbutil::write_infodb_entry($infodbtype, $doc_handle, $oid, $updated_rec);
            &dbutil::close_infodb_write_handle($infodbtype, $doc_handle);
        }
    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.
    if ($mode eq "delete") {
        foreach my $file (@$deleted_files) {
            delete $block_hash->{'reindex_files'}->{$file}
                if (defined $block_hash->{'reindex_files'}->{$file});
        }
    }
}
1482
# Recursively appends the full path of every non-directory entry found
# beneath $dirname to the array referenced by $list. The entries ".", ".."
# and ".svn" are skipped.
#
# A lexical directory handle is used so that the recursion (and any other
# code in this package) never shares one global handle.
#
# @param $dirname  directory to scan
# @param $list     array ref that the paths found are pushed onto
# @return -1 if $dirname itself cannot be read (a warning is printed to
#         STDERR), 0 otherwise. A failure to read a subdirectory is warned
#         about by the recursive call but does not change the result.
sub add_dir_contents_to_list
{
    my ($dirname, $list) = @_;

    # find all the files in the directory
    my $dir_handle;
    if (!opendir($dir_handle, $dirname)) {
        print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
        return -1; # error in processing
    }
    my @entries = readdir($dir_handle);
    closedir($dir_handle);

    # Recur over directory contents.
    foreach my $subfile (@entries) {
        next if ($subfile =~ m/^\.\.?$/);
        next if ($subfile =~ /^\.svn$/);

        my $full_file = &FileUtils::filenameConcatenate($dirname, $subfile);
        if (-d $full_file) {
            &add_dir_contents_to_list($full_file, $list);
        }
        else {
            push(@$list, $full_file);
        }
    }

    return 0;
}
1511
1512
15131;
Note: See TracBrowser for help on using the repository browser.