source: main/trunk/greenstone2/perllib/inexport.pm@ 32565

Last change on this file since 32565 was 32563, checked in by ak19, 5 years ago
  1. Overhaul of GreenstoneSQLPlugs to handle removeold and incremental delete correctly. And now code also automatically handles 'non-incremental delete' (see mention in ArchivesInfPlugin). The new version no longer does lazy loading for getting the sql db connection in the GS SQL Plugin, as now the connection needs to be active since the start of the plugin to run SQL delete statements on remove_old. So the db connection code for the GS SQL plugin has moved back into its init() method. Lots of changes to gssql.pm (and some flow on effects to the GS SQL Plugout) as when database tables exist and need to be created have changed. 2. Undoing most of the changes of changeset 32555 since we're doing incremental delete and removeold differently and in the correct way now when using the GreenstoneSQLPlugs.
  • Property svn:executable set to *
File size: 48.1 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use oaiinfo;
38use plugin;
39use plugout;
40use manifest;
41use inexport;
42use util;
43use scriptutil;
44use FileHandle;
45use gsprintf 'gsprintf';
46use printusage;
47use parse2;
48
49use File::Basename;
50
# Enumerated values accepted by the -OIDtype option (the scheme used to
# generate unique document identifiers during import/export).  The 'desc'
# values are resource-bundle keys that gsprintf resolves when usage text
# is printed.
my $oidtype_list =
    [ { 'name' => "hash",
	'desc' => "{import.OIDtype.hash}" },
      { 'name' => "hash_on_full_filename",
	'desc' => "{import.OIDtype.hash_on_full_filename}" },
      { 'name' => "assigned",
	'desc' => "{import.OIDtype.assigned}" },
      { 'name' => "incremental",
	'desc' => "{import.OIDtype.incremental}" },
      { 'name' => "filename",
	'desc' => "{import.OIDtype.filename}" },
      { 'name' => "dirname",
	'desc' => "{import.OIDtype.dirname}" },
      { 'name' => "full_filename",
	'desc' => "{import.OIDtype.full_filename}" } ];
66
# Directory-related options shared by import.pl and export.pl:
# -importdir (where the source documents live) and -collectdir (the parent
# directory that holds the collection).  Kept separate from the general
# argument list so each script can splice them into its own option set.
$inexport::directory_arguments =
[
      { 'name' => "importdir",
	'desc' => "{import.importdir}",
	'type' => "string",
	'reqd' => "no",
	'deft' => "import",
	'hiddengli' => "yes" },
      { 'name' => "collectdir",
	'desc' => "{import.collectdir}",
	'type' => "string",
	# parsearg left "" as default
	#'deft' => &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect"),
	'deft' => "",
	'reqd' => "no",
	'hiddengli' => "yes" },

];
# General command-line options shared by import.pl and export.pl.  Each
# entry is a parse2-style option descriptor; 'desc' values are
# resource-bundle keys, 'hiddengli'/'modegli' control visibility in the
# GLI interface.
$inexport::arguments =
[
      # don't set the default to hash - want to allow this to come from
      # entry in collect.cfg but want to override it here
      { 'name' => "OIDtype",
	'desc' => "{import.OIDtype}",
	'type' => "enum",
	'list' => $oidtype_list,
	'deft' => "hash_on_full_filename",
	'reqd' => "no",
	'modegli' => "2" },
      { 'name' => "OIDmetadata",
	'desc' => "{import.OIDmetadata}",
	'type' => "string",
	'deft' => "dc.Identifier",
	'reqd' => "no",
	'modegli' => "2" },
      { 'name' => "site",
	'desc' => "{import.site}",
	'type' => "string",
	'deft' => "",
	'reqd' => "no",
	'hiddengli' => "yes" },
      { 'name' => "manifest",
	'desc' => "{import.manifest}",
	'type' => "string",
	'deft' => "",
	'reqd' => "no",
	'hiddengli' => "yes" } ,
      { 'name' => "incremental",
	'desc' => "{import.incremental}",
	'type' => "flag",
	'hiddengli' => "yes" },
      { 'name' => "keepold",
	'desc' => "{import.keepold}",
	'type' => "flag",
	'reqd' => "no",
	'hiddengli' => "yes" },
      { 'name' => "removeold",
	'desc' => "{import.removeold}",
	'type' => "flag",
	'reqd' => "no",
	'hiddengli' => "yes" },
      { 'name' => "language",
	'desc' => "{scripts.language}",
	'type' => "string",
	'reqd' => "no",
	'hiddengli' => "yes" },
      { 'name' => "maxdocs",
	'desc' => "{import.maxdocs}",
	'type' => "int",
	'reqd' => "no",
	'deft' => "-1",
	'range' => "-1,",
	'modegli' => "1" },
      { 'name' => "debug",
	'desc' => "{import.debug}",
	'type' => "flag",
	'reqd' => "no",
	'hiddengli' => "yes" },
      { 'name' => "faillog",
	'desc' => "{import.faillog}",
	'type' => "string",
	# parsearg left "" as default
	#'deft' => &FileUtils::filenameConcatenate("<collectdir>", "colname", "etc", "fail.log"),
	'deft' => "",
	'reqd' => "no",
	'modegli' => "3" },
      { 'name' => "out",
	'desc' => "{import.out}",
	'type' => "string",
	'deft' => "STDERR",
	'reqd' => "no",
	'hiddengli' => "yes" },
      { 'name' => "statsfile",
	'desc' => "{import.statsfile}",
	'type' => "string",
	'deft' => "STDERR",
	'reqd' => "no",
	'hiddengli' => "yes" },
      { 'name' => "verbosity",
	'desc' => "{import.verbosity}",
	'type' => "int",
	'range' => "0,",
	'deft' => "2",
	'reqd' => "no",
	'modegli' => "3" },
      { 'name' => "gli",
	'desc' => "{scripts.gli}",
	'type' => "flag",
	'reqd' => "no",
	'hiddengli' => "yes" },
      { 'name' => "xml",
	'desc' => "{scripts.xml}",
	'type' => "flag",
	'reqd' => "no",
	'hiddengli' => "yes" },

];
184
# Constructor used by import.pl and export.pl.
# $mode is "import" or "export"; $argv is the script's argument list (after
# option parsing exactly one element -- the collection name -- must remain);
# $options describes the recognised arguments; $opt_listall_options is the
# extended option set printed for -listall.
# Handles -xml / -listall / -h usage printing (each of which ends the run or
# returns early), optionally redirects 'out' and 'statsfile' to files, and
# records the collection name on the object.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
	&PrintUsage::print_txt_usage($options, "{import.params}",1);
	print STDERR "Something went wrong during parsing the arguments. Scroll up for details.\n";
	die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
	&gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($self->{'listall'}) {
	if ($self->{'xml'}) {
	    &PrintUsage::print_xml_usage($opt_listall_options);
	}
	else
	{
	    &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
	}
	die "\n";
    }

    if ($self->{'xml'}) {
	&PrintUsage::print_xml_usage($options);
	print "\n";
	return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
	&gsprintf::output_strings_in_UTF8;
    }

    # If the user specified -h, then we output the usage
    if (@$argv && $argv->[0] =~ /^\-+h/) {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }
    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    if ($intArgLeftinAfterParsing != 1 )
    {
	&PrintUsage::print_txt_usage($options, "{import.params}", 1);
	print STDERR "There should be one argument left after parsing the script args: the collection name.\n";
	die "\n";
    }

    # Redirect 'out' to a file when it isn't one of the standard streams
    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
	# BUGFIX: three-argument open so characters such as '>' or '|' in the
	# supplied filename cannot alter the open mode (the previous
	# two-argument open interpolated the filename into the mode string)
	open (OUT, '>', $out) ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
	$out = 'inexport::OUT';
	$self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # Likewise redirect the statistics stream when it isn't STDERR/STDOUT
    $self->{'close_stats'} = 0; # CONSISTENCY: initialise like 'close_out' above
    my $statsfile = $self->{'statsfile'};
    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
	# BUGFIX: three-argument open (see 'out' handling above)
	open (STATSFILE, '>', $statsfile) ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $statsfile) && die);
	$statsfile = 'inexport::STATSFILE';
	$self->{'close_stats'} = 1;
    }
    $statsfile->autoflush(1);
    $self->{'statsfile'} = $statsfile;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # Unless otherwise stated all manifests are considered version 1---where
    # they act more like an advanced process expression---as compared to newer
    # manifest files that act as an explicit (and exhaustive) list of files to
    # process [jmt12]
    $self->{'manifest_version'} = 1;

    return bless $self, $class;
}
279
# Simplified version of the constructor for use with CGI scripts.
# When a $gsdl_cgi helper is supplied, the site and collect directory are
# taken from it; otherwise the default site ("") and
# $GSDLHOME/collect are used.
sub newCGI
{
    my $class = shift (@_);
    my ($mode,$collect,$gsdl_cgi,$opt_site) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    $self->{'out'} = STDERR;

    unless (defined $gsdl_cgi) {
	# No CGI helper available: fall back to the defaults
	$self->{'site'}       = "";
	$self->{'collectdir'} = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
    }
    else {
	# Derive the collect directory from the CGI helper for this site
	$self->{'site'}       = $opt_site;
	$self->{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }

    $self->{'faillog'}    = "";
    $self->{'collection'} = $collect;

    return bless $self, $class;
}
# Accessor: return the name of the collection this object operates on.
sub get_collection
{
    my ($self) = @_;

    return $self->{'collection'};
}
311
312
# Locate and read the collection's configuration file.
# Also:
#  - determines gs_version ("2" or "3") from whether a site name was given
#  - adds the collection's own perllib directory to the include path
#  - opens the fail log and records its handle/name on $self
# Returns ($config_filename, $collectcfg).
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site       = $self->{'site'};
    my $out        = $self->{'out'};

    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
	#&PrintUsage::print_txt_usage($options, "{import.params}", 1);
	die "\n";
    }

    # set gs_version 2/3
    $self->{'gs_version'} = "2";
    if ((defined $site) && ($site ne "")) {
	# gs3
	$self->{'gs_version'} = "3";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib'));

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
	$faillog = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # BUGFIX: three-argument open so a faillog path containing '>' or '|'
    # cannot change the open mode (two-argument open interpolated the path)
    open (FAILLOG, '>', $faillog) ||
	(&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);


    # Remember the path, then switch $faillog over to the handle name so
    # later code can print to it (file relies on "no strict 'refs'")
    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'}       = $faillog;
    $self->{'faillogname'}   = $faillogname;
    $self->{'close_faillog'} = 1;

    # Read in the collection configuration file.
    my $gs_mode = "gs".$self->{'gs_version'}; #gs2 or gs3
    my $config_filename = &colcfg::get_collect_cfg_name($out, $gs_mode);

    # store the config file's name, so oaiinfo object constructor can be instantiated with it
    $self->{'config_filename'} = $config_filename;

    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
365
# Apply settings from the parsed collection configuration ($collectcfg) to
# this object, filling in defaults where the command line did not override
# them: infodbtype, import/archive (or export) directories, verbosity,
# manifest, gzip, maxdocs, OIDtype/OIDmetadata, debug and gli flags, and
# the removeold/keepold/incremental mode triple.
# The 'default_*' keys on $self mark options the user left at their default,
# and hence may be overridden by collect.cfg values.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $importdir  = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
    my $out = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
	$collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
	# we can't use the text version for archives dbs.
	$collectcfg->{'infodbtype'} = "gdbm";
    }

    if (defined $self->{'default_importdir'} && defined $collectcfg->{'importdir'}) {
	$importdir = $collectcfg->{'importdir'};
    }

    if ($inexport_mode eq "import") {
	if ( defined $self->{'default_archivedir'} && defined $collectcfg->{'archivedir'}) {
	    $archivedir = $collectcfg->{'archivedir'};
	}
    }
    elsif ($inexport_mode eq "export") {
	if (defined $self->{'default_exportdir'} && defined $collectcfg->{'exportdir'}) {
	    $archivedir = $collectcfg->{'exportdir'};
	}
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    if (!&FileUtils::isFilenameAbsolute($importdir))
    {
	$importdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $importdir);
    }
    else
    {
	# Don't do this - it kills protocol prefixes
	#$importdir =~ s/[\\\/]+/\//g;
	#$importdir =~ s/\/$//;
	# Do this instead
	# BUGFIX: capture the return value, matching the archivedir handling
	# below -- the original discarded sanitizePath's result, so an
	# absolute importdir was (apparently) never actually sanitised
	$importdir = &FileUtils::sanitizePath($importdir);
    }
    if (!&FileUtils::directoryExists($importdir))
    {
	&gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
	die "\n";
    }
    $self->{'importdir'} = $importdir;

    if (!&FileUtils::isFilenameAbsolute($archivedir)) {
	$archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $archivedir);
    }
    else {
	$archivedir = &FileUtils::sanitizePath($archivedir);
    }
    $self->{'archivedir'} = $archivedir;

    if (defined $self->{'default_verbosity'}) {
	if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
	    $self->{'verbosity'} = $collectcfg->{'verbosity'};
	}
    }

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
	$self->{'manifest'} = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
	if ($collectcfg->{'gzip'} =~ /^true$/i) {
	    $self->{'gzip'} = 1;
	}
    }

    if (defined $self->{'default_maxdocs'}) {
	if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
	    $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
	}
    }

    if (defined $self->{'default_OIDtype'} ) {
	if (defined $collectcfg->{'OIDtype'}
	    && $collectcfg->{'OIDtype'} =~ /^(hash|hash_on_full_filename|incremental|assigned|filename|dirname|full_filename)$/) {
	    $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
	}
    }

    if (defined $self->{'default_OIDmetadata'}) {
	if (defined $collectcfg->{'OIDmetadata'}) {
	    $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
	}
    }

    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
	$self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
	$self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
	= &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
						   $self->{'incremental'}, $checkdir,
						   $collectcfg);

    $self->{'removeold'}        = $removeold;
    $self->{'keepold'}          = $keepold;
    $self->{'incremental'}      = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;

    # Since this wasted my morning, let's at least warn a user that manifest
    # files now *only* work if keepold is set [jmt12]
    if ($self->{'manifest'} && (!$keepold || !$incremental))
    {
	print STDERR "Warning: -manifest flag should not be specified without also setting -keepold or -incremental\n";
    }
}
496
497sub process_files
498{
499 my $self = shift @_;
500 my ($config_filename,$collectcfg) = @_;
501
502 my $inexport_mode = $self->{'mode'};
503
504 my $verbosity = $self->{'verbosity'};
505 my $debug = $self->{'debug'};
506
507 my $importdir = $self->{'importdir'};
508 my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
509
510 my $incremental = $self->{'incremental'};
511 my $incremental_mode = $self->{'incremental_mode'};
512
513 my $gs_version = $self->{'gs_version'};
514
515 my $removeold = $self->{'removeold'};
516 my $keepold = $self->{'keepold'};
517
518 my $saveas = $self->{'saveas'};
519 my $saveas_options = $self->{'saveas_options'};
520 my $OIDtype = $self->{'OIDtype'};
521 my $OIDmetadata = $self->{'OIDmetadata'};
522
523 my $out = $self->{'out'};
524 my $faillog = $self->{'faillog'};
525
526 my $maxdocs = $self->{'maxdocs'};
527 my $gzip = $self->{'gzip'};
528 my $groupsize = $self->{'groupsize'};
529 my $sortmeta = $self->{'sortmeta'};
530
531 my $removeprefix = $self->{'removeprefix'};
532 my $removesuffix = $self->{'removesuffix'};
533
534 my $gli = $self->{'gli'};
535
536 # related to export
537 my $xsltfile = $self->{'xsltfile'};
538 my $group_marc = $self->{'group_marc'};
539 my $mapping_file = $self->{'mapping_file'};
540 my $xslt_mets = $self->{'xslt_mets'};
541 my $xslt_txt = $self->{'xslt_txt'};
542 my $fedora_namespace = $self->{'fedora_namespace'};
543 my $metadata_prefix = $self->{'metadata_prefix'};
544
545 if ($inexport_mode eq "import") {
546 print STDERR "<Import>\n" if $gli;
547 }
548 else {
549 print STDERR "<export>\n" if $gli;
550 }
551
552 my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
553 if ($self->{'manifest'} ne "") {
554 my $manifest_filename = $self->{'manifest'};
555
556 if (!&FileUtils::isFilenameAbsolute($manifest_filename)) {
557 $manifest_filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
558 }
559 $self->{'manifest'} = &FileUtils::sanitizePath($self->{'manifest'});
560 #$self->{'manifest'} =~ s/[\\\/]+/\//g;
561 #$self->{'manifest'} =~ s/\/$//;
562
563 $manifest_lookup->parse($manifest_filename);
564
565 # manifests may now include a version number [jmt12]
566 $self->{'manifest_version'} = $manifest_lookup->get_version();
567 }
568
569 my $manifest = $self->{'manifest'};
570
571 # load all the plugins
572 my $plugins = [];
573 if (defined $collectcfg->{'plugin'}) {
574 $plugins = $collectcfg->{'plugin'};
575 }
576
577 my $plugin_incr_mode = $incremental_mode;
578 if ($manifest ne "") {
579 # if we have a manifest file, then we pretend we are fully incremental for plugins
580 $plugin_incr_mode = "all";
581 }
582 #some global options for the plugins
583 my @global_opts = ();
584
585 my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode, $gs_version, $self->{'site'});
586 if (scalar(@$pluginfo) == 0) {
587 &gsprintf($out, "{import.no_plugins_loaded}\n");
588 die "\n";
589 }
590
591 # remove the old contents of the archives directory (and tmp
592 # directory) if needed
593
594 if ($removeold) {
595 if (&FileUtils::directoryExists($archivedir)) {
596 &gsprintf($out, "{import.removing_archives}\n");
597 &FileUtils::removeFilesRecursive($archivedir);
598 }
599 my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
600 $tmpdir =~ s/[\\\/]+/\//g;
601 $tmpdir =~ s/\/$//;
602 if (&FileUtils::directoryExists($tmpdir)) {
603 &gsprintf($out, "{import.removing_tmpdir}\n");
604 &FileUtils::removeFilesRecursive($tmpdir);
605 }
606 }
607
608 # create the archives dir if needed
609 &FileUtils::makeAllDirectories($archivedir);
610
611 # read the archive information file
612
613 # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
614 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-doc"));
615 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-src"));
616
617 # When we make these initial calls to determine the archive information doc
618 # and src databases we pass through a '1' to indicate this is the first
619 # time we are referring to these databases. When using dynamic dbutils
620 # (available in extensions) this indicates to some database types (for
621 # example, persistent servers) that this is a good time to perform any
622 # one time initialization. The argument has no effect on vanilla dbutils
623 # [jmt12]
624 my $perform_firsttime_init = 1;
625 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, $perform_firsttime_init);
626 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, $perform_firsttime_init);
627
628 my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
629 $archive_info->load_info ($arcinfo_doc_filename);
630
631 if ($manifest eq "") {
632 # Load in list of files in import folder from last import (if present)
633 $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
634 }
635
636 ####Use Plugout####
637 my $plugout;
638
639 my $generate_auxiliary_files = 0;
640 if ($inexport_mode eq "import") {
641 $generate_auxiliary_files = 1;
642 }
643 elsif ($self->{'include_auxiliary_database_files'}) {
644 $generate_auxiliary_files = 1;
645 }
646 $self->{'generate_auxiliary_files'} = $generate_auxiliary_files;
647
648 # Option to use user defined plugout
649 if ($inexport_mode eq "import") {
650 if (defined $collectcfg->{'plugout'}) {
651 # If a plugout was specified in the collect.cfg file, assume it is sensible
652 # We can't check the name because it could be anything, if it is a custom plugout
653 print STDERR "Using plugout specified in collect.cfg: ".join(' ', @{$collectcfg->{'plugout'}})."\n";
654 $plugout = $collectcfg->{'plugout'};
655 }
656 else {
657 push @$plugout,$saveas."Plugout";
658 }
659
660 }
661 else {
662 if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(GreenstoneXML|.*METS|DSpace|MARCXML)Plugout/) {
663 $plugout = $collectcfg->{'plugout'};
664 print STDERR "Using plugout specified in collect.cfg: $collectcfg->{'plugout'}\n";
665 }
666 else {
667 push @$plugout,$saveas."Plugout";
668 }
669 }
670
671 my $plugout_name = $plugout->[0];
672
673 if (defined $saveas_options) {
674 my @user_plugout_options = split(" ", $saveas_options);
675 push @$plugout, @user_plugout_options;
676 }
677 push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
678 push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
679 push @$plugout,("-debug") if ($debug);
680 push @$plugout,("-gzip_output") if ($gzip);
681 push @$plugout,("-output_handle",$out) if (defined $out);
682 push @$plugout,("-site_name",$self->{'site'}) if (defined $self->{'site'});
683
684 push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");
685 push @$plugout, ("-no_auxiliary_databases") if ($generate_auxiliary_files == 0);
686 if ($inexport_mode eq "import") {
687 if ($plugout_name =~ m/^GreenstoneXMLPlugout$/) {
688 push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
689 }
690 }
691 my $processor = &plugout::load_plugout($plugout);
692 $processor->setoutputdir ($archivedir);
693 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
694 $processor->set_OIDtype ($OIDtype, $OIDmetadata);
695 $processor->begin();
696 &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
697
698 if ($removeold) {
699 # occasionally, plugins may want to do something on remove
700 # old, eg pharos image indexing
701 &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
702 }
703
704 # process the import directory
705 my $block_hash = {};
706 $block_hash->{'new_files'} = {};
707 $block_hash->{'reindex_files'} = {};
708 # all of these are set somewhere else, so it's more readable to define them
709 # here [jmt12]
710 $block_hash->{'all_files'} = {};
711 $block_hash->{'deleted_files'} = {};
712 $block_hash->{'file_blocks'} = {};
713 $block_hash->{'metadata_files'} = {};
714 $block_hash->{'shared_fileroot'} = '';
715 # a new flag so we can tell we had a manifest way down in the plugins
716 # [jmt12]
717 $block_hash->{'manifest'} = 'false';
718 my $metadata = {};
719
720 # global blocking pass may set up some metadata
721 # does this set up metadata?????
722 # - when we have a newer manifest file we don't do this -unless- the
723 # collection configuration indicates this collection contains complex
724 # (inherited) metadata [jmt12]
725 if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true'))
726 {
727 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
728 }
729 else
730 {
731 print STDERR "Skipping global file scan due to manifest and complexmeta configuration\n";
732 }
733
734
735 # Prepare to work with the <collection>/etc/oai-inf.<db> that keeps track
736 # of the OAI identifiers with their time stamps and deleted status.
737 my $oai_info = new oaiinfo($self->{'config_filename'}, $collectcfg->{'infodbtype'}, $verbosity);
738 my $have_manifest = ($manifest eq '') ? 0 : 1;
739 $oai_info->import_stage($removeold, $have_manifest);
740
741
742 if ($manifest ne "") {
743
744 # mark that we are using a manifest - information that might be needed
745 # down in plugins (for instance DirectoryPlugin)
746 $block_hash->{'manifest'} = $self->{'manifest_version'};
747
748 #
749 # 1. Process delete files first
750 #
751 my @deleted_files = keys %{$manifest_lookup->{'delete'}};
752 my @full_deleted_files = ();
753
754 # ensure all filenames are absolute
755 foreach my $df (@deleted_files) {
756 my $full_df =
757 (&FileUtils::isFilenameAbsolute($df))
758 ? $df
759 : &FileUtils::filenameConcatenate($importdir,$df);
760
761 if (-d $full_df) {
762 &add_dir_contents_to_list($full_df, \@full_deleted_files);
763 } else {
764 push(@full_deleted_files,$full_df);
765 }
766 }
767
768 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
769 mark_docs_for_deletion($archive_info,{},
770 \@full_deleted_files,
771 $archivedir, $verbosity, "delete");
772
773
774 #
775 # 2. Now files for reindexing
776 #
777
778 my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
779 my @full_reindex_files = ();
780 # ensure all filenames are absolute
781 foreach my $rf (@reindex_files) {
782 my $full_rf =
783 (&FileUtils::isFilenameAbsolute($rf))
784 ? $rf
785 : &FileUtils::filenameConcatenate($importdir,$rf);
786
787 if (-d $full_rf) {
788 &add_dir_contents_to_list($full_rf, \@full_reindex_files);
789 } else {
790 push(@full_reindex_files,$full_rf);
791 }
792 }
793
794 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
795 mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
796
797 # And now to ensure the new version of the file processed by
798 # appropriate plugin, we need to add it to block_hash reindex list
799 foreach my $full_rf (@full_reindex_files) {
800 $block_hash->{'reindex_files'}->{$full_rf} = 1;
801 }
802
803
804 #
805 # 3. Now finally any new files - add to block_hash new_files list
806 #
807
808 my @new_files = keys %{$manifest_lookup->{'index'}};
809 my @full_new_files = ();
810
811 foreach my $nf (@new_files) {
812 # ensure filename is absolute
813 my $full_nf =
814 (&FileUtils::isFilenameAbsolute($nf))
815 ? $nf
816 : &FileUtils::filenameConcatenate($importdir,$nf);
817
818 if (-d $full_nf) {
819 &add_dir_contents_to_list($full_nf, \@full_new_files);
820 } else {
821 push(@full_new_files,$full_nf);
822 }
823 }
824
825 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
826 # need to check this file exists before trying to read it - in the past
827 # it wasn't possible to have a manifest unless keepold was also set so
828 # you were pretty much guaranteed arcinfo existed
829 # [jmt12]
830 # @todo &FileUtils::fileExists($arcinfo_src_filename) [jmt12]
831 if (-e $arcinfo_src_filename)
832 {
833 my $arcinfodb_map = {};
834 &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
835 foreach my $f (@full_new_files) {
836 my $rel_f = &util::abspath_to_placeholders($f);
837
838 # check that we haven't seen it already
839 if (defined $arcinfodb_map->{$rel_f}) {
840 # TODO make better warning
841 print STDERR "Warning: $f ($rel_f) already in src archive, \n";
842 } else {
843 $block_hash->{'new_files'}->{$f} = 1;
844 }
845 }
846
847 undef $arcinfodb_map;
848 }
849 # no existing files - so we can just add all the files [jmt12]
850 else
851 {
852 foreach my $f (@full_new_files)
853 {
854 $block_hash->{'new_files'}->{$f} = 1;
855 }
856 }
857
858 # If we are not using complex inherited metadata (and thus have skipped
859 # the global file scan) we need to at least check for a matching
860 # metadata.xml for the files being indexed/reindexed
861 # - unless we are using the newer version of Manifests, which are treated
862 # verbatim, and should have a metadata element for metadata files (so
863 # we can explicitly process metadata files other than metadata.xml)
864 # [jmt12]
865 if ($self->{'manifest_version'} == 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true'))
866 {
867 my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
868 foreach my $file_to_import (@all_files_to_import)
869 {
870 my $metadata_xml_path = $file_to_import;
871 $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/;
872 if (&FileUtils::fileExists($metadata_xml_path))
873 {
874 &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli);
875 }
876 }
877 }
878
879 # new version manifest files explicitly list metadata files to be
880 # processed (ignoring complexmeta if set)
881 # [jmt12]
882 if ($self->{'manifest_version'} > 1)
883 {
884 # Process metadata files
885 foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}})
886 {
887 $self->perform_process_files($manifest, $pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs);
888 }
889 }
890 } # end if (manifest ne "")
891 else {
892 # if incremental, we read through the import folder to see whats changed.
893
894 if ($incremental || $incremental_mode eq "onlyadd") {
895 prime_doc_oid_count($archivedir);
896
897 # Can now work out which files were new, already existed, and have
898 # been deleted
899
900 new_vs_old_import_diff($archive_info,$block_hash,$importdir,
901 $archivedir,$verbosity,$incremental_mode);
902
903 my @new_files = sort keys %{$block_hash->{'new_files'}};
904 if (scalar(@new_files>0)) {
905 print STDERR "New files and modified metadata files since last import:\n ";
906 print STDERR join("\n ",@new_files), "\n";
907 }
908
909 if ($incremental) {
910 # only look for deletions if we are truely incremental
911 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
912 # Filter out any in gsdl/tmp area
913 my @filtered_deleted_files = ();
914 my $gsdl_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
915 my $collect_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
916 $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
917 $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
918
919 foreach my $df (@deleted_files) {
920 next if ($df =~ m/^$gsdl_tmp_area/);
921 next if ($df =~ m/^$collect_tmp_area/);
922
923 push(@filtered_deleted_files,$df);
924 }
925
926
927 @deleted_files = @filtered_deleted_files;
928
929 if (scalar(@deleted_files)>0) {
930 print STDERR "Files deleted since last import:\n ";
931 print STDERR join("\n ",@deleted_files), "\n";
932
933
934 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
935
936 mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
937 }
938
939 my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
940
941 if (scalar(@reindex_files)>0) {
942 print STDERR "Files to reindex since last import:\n ";
943 print STDERR join("\n ",@reindex_files), "\n";
944 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
945 mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
946 }
947
948 }
949 } # end if incremental/only_add mode
950 # else no manifest AND not incremental
951 } # end if else block of manifest ne "" else eq ""
952
953 # Check for existence of the file that's to contain earliestDateStamp in archivesdir
954 # Do nothing if the file already exists (file exists on incremental build).
955 # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
956 # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
957 # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
958 # oailastmodified and oailastmodifieddate
959 my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp");
960 if ($self->{'generate_auxiliary_files'}) {
961 if (!-f $earliestDatestampFile && -d $archivedir) {
962 my $current_time_in_seconds = time; # in seconds
963
964 if(open(FOUT, ">$earliestDatestampFile")) {
965 # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
966 print FOUT $current_time_in_seconds;
967 close(FOUT);
968 }
969 else {
970 &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
971 }
972
973 }
974 }
975
976 $self->perform_process_files($manifest, $pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs);
977
978 if ($saveas eq "FedoraMETS") {
979 # create collection "doc obj" for Fedora that contains
980 # collection-level metadata
981
982 my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
983 $doc_obj->set_OID("collection");
984
985 my $col_name = undef;
986 my $col_meta = $collectcfg->{'collectionmeta'};
987
988 if (defined $col_meta) {
989 store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
990 store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
991 }
992 $processor->process($doc_obj);
993 }
994
995 &plugin::end($pluginfo, $processor);
996
997 &plugin::deinit($pluginfo, $processor);
998
999 # Store the value of OIDCount (used in doc.pm) so it can be
1000 # restored correctly to this value on an incremental build
1001 # - this OIDcount file should only be generated for numerical oids [jmt12]
1002 if ($self->{'OIDtype'} eq 'incremental')
1003 {
1004 store_doc_oid_count($archivedir);
1005 }
1006
1007 # signal to the processor (plugout) that we have finished processing - if we are group processing, then the final output file needs closing.
1008 $processor->close_group_output() if $processor->is_group();
1009 $processor->end();
1010
1011# if ($inexport_mode eq "import") {
1012 if ($self->{'generate_auxiliary_files'}) {
1013 # write out the archive information file
1014 # for backwards compatability with archvies.inf file
1015 if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
1016 $archive_info->save_info($arcinfo_doc_filename);
1017 }
1018 else {
1019 $archive_info->save_revinfo_db($arcinfo_src_filename);
1020 }
1021 }
1022 return $pluginfo;
1023}
1024
1025# @function perform_process_files()
1026# while process_files() above prepares the system to import files this is the
1027# function that actually initiates the plugin pipeline to process the files.
1028# This function should therefore be overridden in subclasses of inexport.pm should
1029# they wish to do different or further processing
1030# @author jmt12
sub perform_process_files
{
    my $self = shift(@_);
    my ($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs) = @_;
    my $gli = $self->{'gli'};

    # Case 1: a specific file has been nominated for processing, as happens
    # with version 2+ manifests (one call per listed file)
    if ($file_to_import ne '')
    {
	&plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
	return;
    }

    # Case 2: global scan of the import directory. If a new-version manifest
    # was in play the individual files were already read above; older (v1)
    # manifests steer the scan through extra settings in $block_hash, and
    # plain non-manifest imports use a regular $block_hash (so obeying
    # process_exp and block_exp) [jmt12]
    if ($manifest eq '' || $self->{'manifest_version'} == 1)
    {
	&plugin::read ($pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
	return;
    }

    # Case 3: a v2+ manifest with no specific file - nothing left to do here
    print STDERR "Skipping perform_process_files() due to manifest presence and version\n";
}
# perform_process_files()
1055
1056# @function generate_statistics()
sub generate_statistics
{
    # Print the completion banner for this import/export run and delegate to
    # the plugin layer to write out per-plugin processing statistics.
    my $self = shift @_;
    my ($pluginfo) = @_;

    my $out = $self->{'out'};
    my $banner = "*********************************************\n";

    &gsprintf($out, "\n");
    &gsprintf($out, $banner);
    # resource key is e.g. "{import.complete}" or "{export.complete}"
    &gsprintf($out, "{" . $self->{'mode'} . ".complete}\n");
    &gsprintf($out, $banner);

    &plugin::write_stats($pluginfo, $self->{'statsfile'}, $self->{'faillogname'}, $self->{'gli'});
}
# generate_statistics()
1076
1077
1078# @function deinit()
1079# Close down any file handles that we opened (and hence are responsible for
1080# closing
sub deinit
{
    # Close down any file handles that we opened (and hence are responsible
    # for closing).  The close_* flags are set wherever the corresponding
    # handle was opened by this object.
    my $self = shift(@_);

    if ($self->{'close_out'})       { close OUT; }
    if ($self->{'close_faillog'})   { close FAILLOG; }
    if ($self->{'close_statsfile'}) { close STATSFILE; }
}
# deinit()
1089
1090
# Copy one collection-level metadata field (e.g. "collectionname") out of the
# parsed collect.cfg metadata hash and attach it to the top section of the
# given "collection" doc object, preserving any [l=lang] language qualifier
# as an ^lang suffix on the ex. metadata label.
sub store_collectionmeta
{
    my ($collectionmeta,$field,$doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();

    # maps language-qualifier keys (e.g. "[l=en]") to metadata values
    my $lang_map = $collectionmeta->{$field};

    while (my ($lang_key, $value) = each %$lang_map)
    {
	### print STDERR "*** $lang_key = $value\n";

	my $md_label = "ex.$field";

	# "[l=xx]" keys become a ^xx language suffix on the label
	if ($lang_key =~ m/^\[l=(.*?)\]$/)
	{
	    $md_label .= "^$1";
	}

	$doc_obj->add_utf8_metadata($top_section, $md_label, $value);

	# see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
	# while "collectionname" in GS2 is called "name" in GS3.
	# Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
	if (($md_label eq "ex.collectionname^en") || ($md_label eq "ex.collectionname"))
	{
	    $doc_obj->add_utf8_metadata($top_section, "dc.Title", $value);
	}
    }
}
1128
1129
sub oid_count_file {
    # Return the full path of the "OIDcount" file kept in the archives
    # directory, which records the last numerical OID issued by doc.pm.
    my ($archivedir) = @_;
    my $oid_count_path = &FileUtils::filenameConcatenate($archivedir, "OIDcount");
    return $oid_count_path;
}
1134
1135
# Restore doc.pm's OID counter ($doc::OIDcount) from the "OIDcount" file in
# the archives directory (if one exists), so that numerical OIDs assigned
# during an incremental build continue on from the previous import.
# Does nothing if the file is absent; warns (via gsprintf) if it exists but
# cannot be read.
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # nothing to prime on a full (non-incremental) build
    return unless (-e $oid_count_filename);

    # 3-arg open with a lexical filehandle (safer than the old 2-arg
    # open with an interpolated filename into a bareword handle)
    if (open(my $oidin, '<', $oid_count_filename)) {
	my $OIDcount = <$oidin>;
	chomp $OIDcount;
	close($oidin);

	$doc::OIDcount = $OIDcount;
    }
    else {
	&gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
    }
}
1155
sub store_doc_oid_count
{
    # Use the file "OIDcount" in the archives directory to record
    # what value doc.pm got up to, so a later incremental build can
    # resume OID numbering from the same point (see prime_doc_oid_count).
    # Warns (via gsprintf) if the file cannot be written.

    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # 3-arg open with a lexical filehandle (replaces the old 2-arg open
    # into the bareword handle OIDOUT)
    # @todo $oidout = &FileUtils::openFileDescriptor($oid_count_filename, 'w') [jmt12]
    if (open(my $oidout, '>', $oid_count_filename)) {
	print $oidout $doc::OIDcount, "\n";
	close($oidout);
    }
    else {
	&gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
1174
1175
1176
# Compare the current contents of the import folder against the file list
# recorded at the previous import, and partition every entry of
# $block_hash->{'all_files'} into one of:
#   'new_files'                        - not seen before (or treated as new)
#   'existing_files'                   - seen before, unchanged status
#   'reindex_files'                    - seen before but needs reprocessing
#   'new_or_modified_metadata_files'   - metadata.xml-style files added/changed
#   'deleted_files'                    - in the previous import but now gone
# File "newness" is judged by comparing -M ages against the age of the
# archiveinf-doc database written by the last build.
sub new_vs_old_import_diff
{
    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    # -M gives script-relative age in days; a smaller value means newer
    my $archiveinf_timestamp = -M $arcinfo_doc_filename;

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default
    my $prev_all_files = $archive_info->{'prev_import_filelist'};
    my $full_prev_all_files = {};

    foreach my $prev_file (keys %$prev_all_files) {

	if (!&FileUtils::isFilenameAbsolute($prev_file)) {
	    my $full_prev_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$prev_file);
	    $full_prev_all_files->{$full_prev_file} = $prev_file;
	}
	else {
	    $full_prev_all_files->{$prev_file} = $prev_file;
	}
    }

    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

	my $full_curr_file = $curr_file;

	# entry in 'all_files' is moved to either 'existing_files',
	# 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

	if (!&FileUtils::isFilenameAbsolute($curr_file)) {
	    # add in import dir to make absolute
	    $full_curr_file = &FileUtils::filenameConcatenate($importdir,$curr_file);
	}

	# figure out if new file or not
	if (defined $full_prev_all_files->{$full_curr_file}) {
	    # delete it so that only files that need deleting are left
	    delete $full_prev_all_files->{$full_curr_file};

	    # had it before. is it a metadata file?
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {

		# is it modified??
		if (-M $full_curr_file < $archiveinf_timestamp) {
		    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
		    # its newer than last build
		    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
		}
	    }
	    else {
		if ($incremental_mode eq "all") {

		    # had it before
		    $block_hash->{'existing_files'}->{$full_curr_file} = 1;
		}
		else {
		    # Warning in "onlyadd" mode, but had it before!
		    print STDERR "Warning: File $full_curr_file previously imported.\n";
		    print STDERR " Treating as new file\n";

		    $block_hash->{'new_files'}->{$full_curr_file} = 1;
		}
	    }
	}
	else {
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# the new file is the special sort of file greenstone uses
		# to attach metadata to src documents
		# i.e metadata.xml
		# (but note, the filename used is not constrained in
		# Greenstone to always be this)

		print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
		$block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
	    }
	    else {
		$block_hash->{'new_files'}->{$full_curr_file} = 1;
	    }
	}

	delete $block_hash->{'all_files'}->{$curr_file};
    }

    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
	my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

	$situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
	$situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression

	# Go through existing_files, and mark anything that is contained
	# within 'situated_dir' to be reindexed (in case some of the metadata
	# attaches to one of these files)

	my $reindex_files = [];

	foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

	    if ($existing_f =~ m/^$situated_dir/) {

		push(@$reindex_files,$existing_f);
		$block_hash->{'reindex_files'}->{$existing_f} = 1;
		delete $block_hash->{'existing_files'}->{$existing_f};
	    }
	}

	# metadata file needs to be in new_files list so parsed by MetadataXMLPlug
	# (or equivalent)
	$block_hash->{'new_files'}->{$new_mdf} = 1;
    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
	if (-M $existing_filename < $archiveinf_timestamp) {
	    # file is newer than last build

	    my $existing_file = $existing_filename;

	    print STDERR "**** Reindexing existing file: $existing_file\n";

	    push(@$reindex_files,$existing_file);
	    $block_hash->{'reindex_files'}->{$existing_filename} = 1;
	}
    }

    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # supress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    # plain foreach replaces the original map-in-void-context, which was
    # used purely for its side effects
    my @deleted_files = values %$full_prev_all_files;
    foreach my $curr_file (@deleted_files) {
	my $full_curr_file = $curr_file;

	if (!&FileUtils::isFilenameAbsolute($curr_file)) {
	    # add in collect dir to make absolute
	    $full_curr_file = &FileUtils::filenameConcatenate($collectdir,$curr_file);
	}

	# only schedule for deletion if it is genuinely gone from disk
	if (!-e $full_curr_file) {
	    $block_hash->{'deleted_files'}->{$curr_file} = 1;
	}
    }
}
1374
1375
# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
# $mode is 'delete' or 'reindex'
#
# For every file in @$deleted_files:
#   - removes its record from the archiveinf-src database
#   - for each OID that file contributed to, marks the OID's index-status as
#     "D" in the archiveinf-doc database so buildcol.pl deletes/reimports it
#   - if the file was an associated/metadata file (not the OID's primary
#     source file), schedules the primary source file for reindexing
# In 'delete' mode, finally un-schedules any reindexing of files that were
# themselves deleted.
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    # $mode_text is only used in the verbosity>1 progress message below
    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
	$mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
	# use 'archiveinf-src' info database file to look up all the OIDs
	# that this file is used in (note in most cases, it's just one OID)

	# src records are keyed by placeholder-relative paths, not absolute ones
	my $relfile = &util::abspath_to_placeholders($file);

	my $src_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $relfile);
	my $oids = $src_rec->{'oid'};
	my $file_record_deleted = 0;

	# delete the src record
	my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
	&dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $relfile);
	&dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);


	foreach my $oid (@$oids) {

	    # find the source doc (the primary file that becomes this oid)
	    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
	    my $doc_source_file = $doc_rec->{'src-file'}->[0];
	    $doc_source_file = &util::placeholders_to_abspath($doc_source_file);

	    if (!&FileUtils::isFilenameAbsolute($doc_source_file)) {
		$doc_source_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
	    }

	    if ($doc_source_file ne $file) {
		# its an associated or metadata file

		# mark source doc for reimport as one of its assoc files has changed or deleted
		$block_hash->{'reindex_files'}->{$doc_source_file} = 1;

	    }
	    # only update the doc record if it isn't already marked "D"
	    my $curr_status = $archive_info->get_status_info($oid);
	    if (defined($curr_status) && (($curr_status ne "D"))) {
		if ($verbosity>1) {
		    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
		}
		# mark oid for deletion (it will be deleted or reimported)
		$archive_info->set_status_info($oid,"D");
		# rewrite the raw record's <index-status> line to "D", then
		# write the whole (re-parsed) record back to the database
		my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
		$val =~ s/^<index-status>(.*)$/<index-status>D/m;

		my $val_rec = &dbutil::convert_infodb_string_to_hash($infodbtype,$val);
		my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

		&dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
		&dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
	    }
	}

    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.

    if ($mode eq "delete") {
	foreach my $file (@$deleted_files) {
	    if (defined $block_hash->{'reindex_files'}->{$file}) {
		delete $block_hash->{'reindex_files'}->{$file};
	    }
	}
    }


}
1464
sub add_dir_contents_to_list {

    # Recursively append every file found beneath $dirname onto @$list,
    # skipping '.', '..' and Subversion metadata directories.
    # Returns -1 if the directory could not be read.
    my ($dirname, $list) = @_;

    # find all the files in the directory, using a lexical directory handle
    # (replaces the old bareword DIR handle)
    if (!opendir(my $dir_handle, $dirname)) {
	print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    my @dir_entries = readdir($dir_handle);
    closedir($dir_handle);

    foreach my $subfile (@dir_entries) {
	next if ($subfile =~ m/^\.\.?$/);   # skip . and ..
	next if ($subfile =~ /^\.svn$/);    # skip svn bookkeeping dirs
	my $full_file = &FileUtils::filenameConcatenate($dirname, $subfile);
	if (-d $full_file) {
	    # recurse into subdirectories
	    &add_dir_contents_to_list($full_file, $list);
	} else {
	    push (@$list, $full_file);
	}
    }

}
1493
1494
14951;
# Note: See TracBrowser for help on using the repository browser.