source: main/trunk/greenstone2/perllib/inexport.pm@ 36471

Last change on this file since 36471 was 36471, checked in by anupama, 20 months ago

Dr Bainbridge's suggested bugfix worked: when incremental-building on windows, deleting items when the collection had a longer filename like manifest-demo-e failed with warnings from gdbm_delete about the item for the filepath used as delete key not being found. Deleting worked in collections with shorter filenames (ultimately, full paths to files being deleted were shorter). As Dr Bainbridge suggested, it merely required ensuring that we compare full/long filenames with what's in the database, as windows short filenames won't hit a match. The fix is a call to util::upgrade_if_dos_filename() to ensure we compare apples with apples. Tested on a Windows binary GS2 installation and it worked, but copied the line to a linux GS2 install as it's an SVN install and will allow me to commit.

  • Property svn:executable set to *
File size: 49.3 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use oaiinfo;
38use plugin;
39use plugout;
40use manifest;
41use inexport;
42use util;
43use scriptutil;
44use FileHandle;
45use gsprintf 'gsprintf';
46use printusage;
47use parse2;
48
49use File::Basename;
50
# The set of OID (document object identifier) generation schemes that the
# import/export scripts accept for the -OIDtype option. Each entry pairs the
# scheme's name with its resource-bundle description key.
my $oidtype_list = [
    { 'name' => "hash",                  'desc' => "{import.OIDtype.hash}" },
    { 'name' => "hash_on_full_filename", 'desc' => "{import.OIDtype.hash_on_full_filename}" },
    { 'name' => "assigned",              'desc' => "{import.OIDtype.assigned}" },
    { 'name' => "incremental",           'desc' => "{import.OIDtype.incremental}" },
    { 'name' => "filename",              'desc' => "{import.OIDtype.filename}" },
    { 'name' => "dirname",               'desc' => "{import.OIDtype.dirname}" },
    { 'name' => "full_filename",         'desc' => "{import.OIDtype.full_filename}" },
];
66
# Command-line arguments controlling the directories import/export work
# with: the source documents directory and the collect directory root.
$inexport::directory_arguments = [
    { 'name'      => "importdir",
      'desc'      => "{import.importdir}",
      'type'      => "string",
      'reqd'      => "no",
      'deft'      => "import",
      'hiddengli' => "yes" },

    { 'name'      => "collectdir",
      'desc'      => "{import.collectdir}",
      'type'      => "string",
      # parsearg left "" as default
      #'deft' => &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect"),
      'deft'      => "",
      'reqd'      => "no",
      'hiddengli' => "yes" },
];
# The general command-line arguments shared by import.pl and export.pl.
# 'hiddengli' entries are not shown in the GLI interface; 'modegli'
# controls at which GLI mode level an option becomes visible.
$inexport::arguments = [
    # don't set the default to hash - want to allow this to come from
    # entry in collect.cfg but want to override it here
    { 'name'    => "OIDtype",
      'desc'    => "{import.OIDtype}",
      'type'    => "enum",
      'list'    => $oidtype_list,
      'deft'    => "hash_on_full_filename",
      'reqd'    => "no",
      'modegli' => "2" },

    { 'name'    => "OIDmetadata",
      'desc'    => "{import.OIDmetadata}",
      'type'    => "string",
      'deft'    => "dc.Identifier",
      'reqd'    => "no",
      'modegli' => "2" },

    { 'name'      => "site",
      'desc'      => "{import.site}",
      'type'      => "string",
      'deft'      => "",
      'reqd'      => "no",
      'hiddengli' => "yes" },

    { 'name'      => "manifest",
      'desc'      => "{import.manifest}",
      'type'      => "string",
      'deft'      => "",
      'reqd'      => "no",
      'hiddengli' => "yes" },

    { 'name'      => "incremental",
      'desc'      => "{import.incremental}",
      'type'      => "flag",
      'hiddengli' => "yes" },

    { 'name'      => "keepold",
      'desc'      => "{import.keepold}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },

    { 'name'      => "removeold",
      'desc'      => "{import.removeold}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },

    { 'name'      => "language",
      'desc'      => "{scripts.language}",
      'type'      => "string",
      'reqd'      => "no",
      'hiddengli' => "yes" },

    { 'name'    => "maxdocs",
      'desc'    => "{import.maxdocs}",
      'type'    => "int",
      'reqd'    => "no",
      'deft'    => "-1",
      'range'   => "-1,",
      'modegli' => "1" },

    { 'name'      => "debug",
      'desc'      => "{import.debug}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },

    { 'name'    => "faillog",
      'desc'    => "{import.faillog}",
      'type'    => "string",
      # parsearg left "" as default
      #'deft' => &FileUtils::filenameConcatenate("<collectdir>", "colname", "etc", "fail.log"),
      'deft'    => "",
      'reqd'    => "no",
      'modegli' => "3" },

    { 'name'      => "out",
      'desc'      => "{import.out}",
      'type'      => "string",
      'deft'      => "STDERR",
      'reqd'      => "no",
      'hiddengli' => "yes" },

    { 'name'      => "statsfile",
      'desc'      => "{import.statsfile}",
      'type'      => "string",
      'deft'      => "STDERR",
      'reqd'      => "no",
      'hiddengli' => "yes" },

    { 'name'    => "verbosity",
      'desc'    => "{import.verbosity}",
      'type'    => "int",
      'range'   => "0,",
      'deft'    => "2",
      'reqd'    => "no",
      'modegli' => "3" },

    { 'name'      => "gli",
      'desc'      => "{scripts.gli}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },

    { 'name'      => "xml",
      'desc'      => "{scripts.xml}",
      'type'      => "flag",
      'reqd'      => "no",
      'hiddengli' => "yes" },
];
184
# Constructor, shared by import.pl and export.pl.
#
#   $mode                - "import" or "export"
#   $argv                - ref to the script's argument list; after option
#                          parsing exactly one element (the collection name)
#                          must remain
#   $options             - hashref whose 'args' entry lists the options
#                          recognised by parse2 (parsed values land in $self)
#   $opt_listall_options - option set printed when -listall was given
#
# Dies (after printing usage) on parse errors, -h, usage/XML listings, or a
# wrong number of leftover arguments. Returns a blessed inexport object.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
	&PrintUsage::print_txt_usage($options, "{import.params}",1);
	print STDERR "Something went wrong during parsing the arguments. Scroll up for details.\n";
	die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
	&gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($self->{'listall'}) {
	if ($self->{'xml'}) {
	    &PrintUsage::print_xml_usage($opt_listall_options);
	}
	else
	{
	    &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
	}
	die "\n";
    }

    if ($self->{'xml'}) {
        &PrintUsage::print_xml_usage($options);
	print "\n";
	return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
	&gsprintf::output_strings_in_UTF8;
    }

    # If the user specified -h, then we output the usage
    if (@$argv && $argv->[0] =~ /^\-+h/) {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }
    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name

    if ($intArgLeftinAfterParsing != 1 )
    {
	&PrintUsage::print_txt_usage($options, "{import.params}", 1);
	print STDERR "There should be one argument left after parsing the script args: the collection name.\n";
	die "\n";
    }

    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
	# BUGFIX: use three-argument open so a filename beginning with a
	# mode character (>, >>, |, whitespace) cannot alter the open mode.
	open (OUT, ">", $out) ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
	$out = 'inexport::OUT';
	$self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    my $statsfile = $self->{'statsfile'};
    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
	# three-argument open, for the same reason as above
	open (STATSFILE, ">", $statsfile) ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $statsfile) && die);
	$statsfile = 'inexport::STATSFILE';
	$self->{'close_stats'} = 1;
    }
    $statsfile->autoflush(1);
    $self->{'statsfile'} = $statsfile;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # Unless otherwise stated all manifests are considered version 1---where
    # they act more like an advanced process expression---as compared to newer
    # manifest files that act as an explicit (and exhaustive) list of files to
    # process [jmt12]
    $self->{'manifest_version'} = 1;

    return bless $self, $class;
}
279
# Simplified constructor for use with CGI scripts. Skips all command-line
# parsing: mode, collection name, an optional gsdl_cgi helper (used to look
# up the collect directory for $opt_site) are passed in directly. Output
# goes to STDERR and the fail log is left unset.
sub newCGI
{
    my ($class, $mode, $collect, $gsdl_cgi, $opt_site) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    $self->{'out'} = STDERR;

    if (defined $gsdl_cgi) {
	# CGI context knows which site's collect directory to use
	$self->{'site'}       = $opt_site;
	$self->{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }
    else {
	# fall back to the default GS2 collect directory
	$self->{'site'}       = "";
	$self->{'collectdir'} = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
    }

    $self->{'faillog'}    = "";
    $self->{'collection'} = $collect;

    return bless $self, $class;
}
# Accessor: returns the name of the collection this object operates on.
sub get_collection
{
    my ($self) = @_;
    return $self->{'collection'};
}
311
312
# Locate and read the collection's configuration file.
#
#   $collection - collection name (resolved via colcfg::use_collection)
#   $options    - option spec, only used for usage printing on failure
#
# Side effects: sets $self->{'gs_version'} ("2" or "3", based on whether a
# site was given), adds the collection's perllib to @INC, opens the fail log
# (stored in $self->{'faillog'} as a symbolic filehandle name), and records
# the config filename in $self->{'config_filename'}.
#
# Returns ($config_filename, $collectcfg) where $collectcfg is the parsed
# configuration hash. Dies if the collection cannot be resolved.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site       = $self->{'site'};
    my $out        = $self->{'out'};

    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
	#&PrintUsage::print_txt_usage($options, "{import.params}", 1);
	die "\n";
    }

    # set gs_version 2/3
    $self->{'gs_version'} = "2";
    if ((defined $site) && ($site ne "")) {
	# gs3
	$self->{'gs_version'} = "3";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib'));

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
	$faillog = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # BUGFIX: three-argument open so a path beginning with a mode character
    # cannot change the open mode
    open (FAILLOG, ">", $faillog) ||
	(&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);

    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'}       = $faillog;
    $self->{'faillogname'}   = $faillogname;
    $self->{'close_faillog'} = 1;

    # Read in the collection configuration file.
    my $gs_mode = "gs".$self->{'gs_version'}; #gs2 or gs3
    my $config_filename = &colcfg::get_collect_cfg_name($out, $gs_mode);

    # store the config file's name, so oaiinfo object constructor can be instantiated with it
    $self->{'config_filename'} = $config_filename;

    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
365
# Merge collection-configuration settings ($collectcfg) with options already
# parsed into $self, resolving import/archive directories to absolute,
# sanitized paths. For options with a 'default_*' marker in $self, the
# collect.cfg value overrides the built-in default (but not an explicit
# command-line value). Also normalizes removeold/keepold/incremental via
# scriptutil. Dies if the import directory does not exist.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $importdir  = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
    my $out        = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
	$collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
	# we can't use the text version for archives dbs.
	$collectcfg->{'infodbtype'} = "gdbm";
    }

    if (defined $self->{'default_importdir'} && defined $collectcfg->{'importdir'}) {
	$importdir = $collectcfg->{'importdir'};
    }

    if ($inexport_mode eq "import") {
	if ( defined $self->{'default_archivedir'} && defined $collectcfg->{'archivedir'}) {
	    $archivedir = $collectcfg->{'archivedir'};
	}
    }
    elsif ($inexport_mode eq "export") {
	if (defined $self->{'default_exportdir'} && defined $collectcfg->{'exportdir'}) {
	    $archivedir = $collectcfg->{'exportdir'};
	}
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    if (!&FileUtils::isFilenameAbsolute($importdir))
    {
	$importdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $importdir);
    }
    else
    {
	# Don't do this - it kills protocol prefixes
	#$importdir =~ s/[\\\/]+/\//g;
	#$importdir =~ s/\/$//;
	# Do this instead.
	# BUGFIX: capture the return value - sanitizePath returns the cleaned
	# path (see the archivedir branch below); previously the result was
	# discarded, leaving $importdir unsanitized.
	$importdir = &FileUtils::sanitizePath($importdir);
    }

    if (!&FileUtils::directoryExists($importdir))
    {
	&gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
	die "\n";
    }
    $self->{'importdir'} = $importdir;

    if (!&FileUtils::isFilenameAbsolute($archivedir)) {
	$archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $archivedir);
    }
    else {
	$archivedir = &FileUtils::sanitizePath($archivedir);
    }
    $self->{'archivedir'} = $archivedir;

    if (defined $self->{'default_verbosity'}) {
	if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
	    $self->{'verbosity'} = $collectcfg->{'verbosity'};
	}
    }

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
	$self->{'manifest'} = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
	if ($collectcfg->{'gzip'} =~ /^true$/i) {
	    $self->{'gzip'} = 1;
	}
    }

    if (defined $self->{'default_maxdocs'}) {
	if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
	    $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
	}
    }

    if (defined $self->{'default_OIDtype'} ) {
	if (defined $collectcfg->{'OIDtype'}
	    && $collectcfg->{'OIDtype'} =~ /^(hash|hash_on_full_filename|incremental|assigned|filename|dirname|full_filename)$/) {
	    $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
	}
    }

    if (defined $self->{'default_OIDmetadata'}) {
	if (defined $collectcfg->{'OIDmetadata'}) {
	    $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
	}
    }

    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
	$self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
	$self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
	= &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
						   $self->{'incremental'}, $checkdir,
						   $collectcfg);

    $self->{'removeold'}        = $removeold;
    $self->{'keepold'}          = $keepold;
    $self->{'incremental'}      = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;

    # Since this wasted my morning, let's at least warn a user that manifest
    # files now *only* work if keepold is set [jmt12]
    if ($self->{'manifest'} && (!$keepold || !$incremental))
    {
	print STDERR "Warning: -manifest flag should not be specified without also setting -keepold or -incremental\n";
    }
}
497
498sub process_files
499{
500 my $self = shift @_;
501 my ($config_filename,$collectcfg) = @_;
502
503 my $inexport_mode = $self->{'mode'};
504
505 my $verbosity = $self->{'verbosity'};
506 my $debug = $self->{'debug'};
507
508 my $importdir = $self->{'importdir'};
509 my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
510
511 my $incremental = $self->{'incremental'};
512 my $incremental_mode = $self->{'incremental_mode'};
513
514 my $gs_version = $self->{'gs_version'};
515
516 my $removeold = $self->{'removeold'};
517 my $keepold = $self->{'keepold'};
518
519 my $saveas = $self->{'saveas'};
520 my $saveas_options = $self->{'saveas_options'};
521 my $OIDtype = $self->{'OIDtype'};
522 my $OIDmetadata = $self->{'OIDmetadata'};
523
524 my $out = $self->{'out'};
525 my $faillog = $self->{'faillog'};
526
527 my $maxdocs = $self->{'maxdocs'};
528 my $gzip = $self->{'gzip'};
529 my $groupsize = $self->{'groupsize'};
530 my $sortmeta = $self->{'sortmeta'};
531
532 my $removeprefix = $self->{'removeprefix'};
533 my $removesuffix = $self->{'removesuffix'};
534
535 my $gli = $self->{'gli'};
536
537 # related to export
538 my $xsltfile = $self->{'xsltfile'};
539 my $group_marc = $self->{'group_marc'};
540 my $mapping_file = $self->{'mapping_file'};
541 my $xslt_mets = $self->{'xslt_mets'};
542 my $xslt_txt = $self->{'xslt_txt'};
543 my $fedora_namespace = $self->{'fedora_namespace'};
544 my $metadata_prefix = $self->{'metadata_prefix'};
545
546 if ($inexport_mode eq "import") {
547 print STDERR "<Import>\n" if $gli;
548 }
549 else {
550 print STDERR "<export>\n" if $gli;
551 }
552
553 my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
554 if ($self->{'manifest'} ne "") {
555 my $manifest_filename = $self->{'manifest'};
556
557 if (!&FileUtils::isFilenameAbsolute($manifest_filename)) {
558 $manifest_filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
559 }
560 $self->{'manifest'} = &FileUtils::sanitizePath($self->{'manifest'});
561 #$self->{'manifest'} =~ s/[\\\/]+/\//g;
562 #$self->{'manifest'} =~ s/\/$//;
563
564 $manifest_lookup->parse($manifest_filename);
565
566 # manifests may now include a version number [jmt12]
567 $self->{'manifest_version'} = $manifest_lookup->get_version();
568 }
569
570 my $manifest = $self->{'manifest'};
571
572 # load all the plugins
573 my $plugins = [];
574 if (defined $collectcfg->{'plugin'}) {
575 $plugins = $collectcfg->{'plugin'};
576 }
577
578 my $plugin_incr_mode = $incremental_mode;
579 if ($manifest ne "") {
580 # if we have a manifest file, then we pretend we are fully incremental for plugins
581 $plugin_incr_mode = "all";
582 }
583 #some global options for the plugins
584 my @global_opts = ();
585
586 my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode, $gs_version, $self->{'site'});
587 if (scalar(@$pluginfo) == 0) {
588 &gsprintf($out, "{import.no_plugins_loaded}\n");
589 die "\n";
590 }
591
592 # remove the old contents of the archives directory (and tmp
593 # directory) if needed
594
595 if ($removeold) {
596 if (&FileUtils::directoryExists($archivedir)) {
597 &gsprintf($out, "{import.removing_archives}\n");
598 &FileUtils::removeFilesRecursive($archivedir);
599 }
600 my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
601 $tmpdir =~ s/[\\\/]+/\//g;
602 $tmpdir =~ s/\/$//;
603 if (&FileUtils::directoryExists($tmpdir)) {
604 &gsprintf($out, "{import.removing_tmpdir}\n");
605 &FileUtils::removeFilesRecursive($tmpdir);
606 }
607 }
608
609 # create the archives dir if needed
610 &FileUtils::makeAllDirectories($archivedir);
611
612 # read the archive information file
613
614 # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
615 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-doc"));
616 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-src"));
617
618 # When we make these initial calls to determine the archive information doc
619 # and src databases we pass through a '1' to indicate this is the first
620 # time we are referring to these databases. When using dynamic dbutils
621 # (available in extensions) this indicates to some database types (for
622 # example, persistent servers) that this is a good time to perform any
623 # one time initialization. The argument has no effect on vanilla dbutils
624 # [jmt12]
625 my $perform_firsttime_init = 1;
626 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, $perform_firsttime_init);
627 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, $perform_firsttime_init);
628
629 my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
630 $archive_info->load_info ($arcinfo_doc_filename);
631 # load in rev info so we don't overwrite existing info when we do incremental import
632 # from here on, make all changes to this object, then write out the file at the end.
633 $archive_info->load_rev_info($arcinfo_src_filename);
634
635 if ($manifest eq "") {
636 # Load in list of files in import folder from last import (if present)
637 $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
638 }
639
640 ####Use Plugout####
641 my $plugout;
642
643 my $generate_auxiliary_files = 0;
644 if ($inexport_mode eq "import") {
645 $generate_auxiliary_files = 1;
646 }
647 elsif ($self->{'include_auxiliary_database_files'}) {
648 $generate_auxiliary_files = 1;
649 }
650 $self->{'generate_auxiliary_files'} = $generate_auxiliary_files;
651
652 # Option to use user defined plugout
653 if ($inexport_mode eq "import") {
654 if (defined $collectcfg->{'plugout'}) {
655 # If a plugout was specified in the collect.cfg file, assume it is sensible
656 # We can't check the name because it could be anything, if it is a custom plugout
657 print STDERR "Using plugout specified in collect.cfg: ".join(' ', @{$collectcfg->{'plugout'}})."\n";
658 $plugout = $collectcfg->{'plugout'};
659 }
660 else {
661 push @$plugout,$saveas."Plugout";
662 }
663
664 }
665 else {
666 if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(GreenstoneXML|.*METS|DSpace|MARCXML)Plugout/) {
667 $plugout = $collectcfg->{'plugout'};
668 print STDERR "Using plugout specified in collect.cfg: $collectcfg->{'plugout'}\n";
669 }
670 else {
671 push @$plugout,$saveas."Plugout";
672 }
673 }
674
675 my $plugout_name = $plugout->[0];
676
677 if (defined $saveas_options) {
678 my @user_plugout_options = split(" ", $saveas_options);
679 push @$plugout, @user_plugout_options;
680 }
681 push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
682 push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
683 push @$plugout,("-debug") if ($debug);
684 push @$plugout,("-gzip_output") if ($gzip);
685 push @$plugout,("-output_handle",$out) if (defined $out);
686 push @$plugout,("-site",$self->{'site'}) if (defined $self->{'site'});
687
688 push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");
689 push @$plugout, ("-no_auxiliary_databases") if ($generate_auxiliary_files == 0);
690 if ($inexport_mode eq "import") {
691 if ($plugout_name =~ m/^GreenstoneXMLPlugout$/) {
692 push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
693 }
694 }
695 my $processor = &plugout::load_plugout($plugout);
696 $processor->setoutputdir ($archivedir);
697 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
698 $processor->set_OIDtype ($OIDtype, $OIDmetadata);
699 $processor->begin();
700 &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
701
702 if ($removeold) {
703 # occasionally, plugins may want to do something on remove
704 # old, eg pharos image indexing
705 &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
706 }
707
708 # process the import directory
709 my $block_hash = {};
710 $block_hash->{'new_files'} = {};
711 $block_hash->{'reindex_files'} = {};
712 # all of these are set somewhere else, so it's more readable to define them
713 # here [jmt12]
714 $block_hash->{'all_files'} = {};
715 $block_hash->{'deleted_files'} = {};
716 $block_hash->{'file_blocks'} = {};
717 $block_hash->{'metadata_files'} = {};
718 $block_hash->{'shared_fileroot'} = '';
719 $block_hash->{'manifest'} = 'false';
720 my $metadata = {};
721
722 # global blocking pass may set up some metadata
723 # does this set up metadata?????
724 # - when we have a newer manifest file we don't do this -unless- the
725 # collection configuration indicates this collection contains complex
726 # (inherited) metadata [jmt12]
727 if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true'))
728 {
729 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
730 }
731 else
732 {
733 print STDERR "Skipping global file scan due to manifest and complexmeta configuration\n";
734 }
735
736
737 # Prepare to work with the <collection>/etc/oai-inf.<db> that keeps track
738 # of the OAI identifiers with their time stamps and deleted status.
739 my $oai_info = new oaiinfo($self->{'config_filename'}, $collectcfg->{'infodbtype'}, $verbosity);
740 my $have_manifest = ($manifest eq '') ? 0 : 1;
741 $oai_info->import_stage($removeold, $have_manifest);
742
743
744 if ($manifest ne "") {
745
746 # mark that we are using a manifest - information that might be needed
747 # down in plugins (for instance DirectoryPlugin)
748 $block_hash->{'manifest'} = $self->{'manifest_version'};
749
750 #
751 # 1. Process delete files first
752 #
753 my @deleted_files = keys %{$manifest_lookup->{'delete'}};
754 my @full_deleted_files = ();
755
756 # ensure all filenames are absolute
757 foreach my $df (@deleted_files) {
758 my $full_df =
759 (&FileUtils::isFilenameAbsolute($df))
760 ? $df
761 : &FileUtils::filenameConcatenate($importdir,$df);
762
763 # gdb doesn't store short filenames, so ensure we specify full filenames for deletion
764 $full_df = &util::upgrade_if_dos_filename($full_df); # will only do something on windows
765
766 if (-d $full_df) {
767 &add_dir_contents_to_list($full_df, \@full_deleted_files);
768 } else {
769 push(@full_deleted_files,$full_df);
770 }
771 }
772
773 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
774 mark_docs_for_deletion($archive_info,{},
775 \@full_deleted_files,
776 $archivedir, $verbosity, "delete");
777
778
779 #
780 # 2. Now files for reindexing
781 #
782
783 my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
784 my @full_reindex_files = ();
785 # ensure all filenames are absolute
786 foreach my $rf (@reindex_files) {
787 my $full_rf =
788 (&FileUtils::isFilenameAbsolute($rf))
789 ? $rf
790 : &FileUtils::filenameConcatenate($importdir,$rf);
791
792 if (-d $full_rf) {
793 &add_dir_contents_to_list($full_rf, \@full_reindex_files);
794 } else {
795 push(@full_reindex_files,$full_rf);
796 }
797 }
798
799 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
800 mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
801
802 # And now to ensure the new version of the file processed by
803 # appropriate plugin, we need to add it to block_hash reindex list
804 foreach my $full_rf (@full_reindex_files) {
805 $block_hash->{'reindex_files'}->{$full_rf} = 1;
806 }
807
808
809 #
810 # 3. Now finally any new files - add to block_hash new_files list
811 #
812
813 my @new_files = keys %{$manifest_lookup->{'index'}};
814 my @full_new_files = ();
815
816 foreach my $nf (@new_files) {
817 # ensure filename is absolute
818 my $full_nf =
819 (&FileUtils::isFilenameAbsolute($nf))
820 ? $nf
821 : &FileUtils::filenameConcatenate($importdir,$nf);
822
823 if (-d $full_nf) {
824 &add_dir_contents_to_list($full_nf, \@full_new_files);
825 } else {
826 push(@full_new_files,$full_nf);
827 }
828 }
829
830 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
831 # need to check this file exists before trying to read it - in the past
832 # it wasn't possible to have a manifest unless keepold was also set so
833 # you were pretty much guaranteed arcinfo existed
834 # [jmt12]
835 # @todo &FileUtils::fileExists($arcinfo_src_filename) [jmt12]
836 if (-e $arcinfo_src_filename)
837 {
838 my $arcinfodb_map = {};
839 &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
840 foreach my $f (@full_new_files) {
841 my $rel_f = &util::abspath_to_placeholders($f);
842
843 # check that we haven't seen it already
844 if (defined $arcinfodb_map->{$rel_f}) {
845 # TODO make better warning
846 print STDERR "Warning: $f ($rel_f) already in src archive, \n";
847 } else {
848 $block_hash->{'new_files'}->{$f} = 1;
849 }
850 }
851
852 undef $arcinfodb_map;
853 }
854 # no existing files - so we can just add all the files [jmt12]
855 else
856 {
857 foreach my $f (@full_new_files)
858 {
859 $block_hash->{'new_files'}->{$f} = 1;
860 }
861 }
862
863 # If we are not using complex inherited metadata (and thus have skipped
864 # the global file scan) we need to at least check for a matching
865 # metadata.xml for the files being indexed/reindexed
866 # - unless we are using the newer version of Manifests, which are treated
867 # verbatim, and should have a metadata element for metadata files (so
868 # we can explicitly process metadata files other than metadata.xml)
869 # [jmt12]
870 if ($self->{'manifest_version'} == 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true'))
871 {
872 my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
873 foreach my $file_to_import (@all_files_to_import)
874 {
875 my $metadata_xml_path = $file_to_import;
876 $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/;
877 if (&FileUtils::fileExists($metadata_xml_path))
878 {
879 &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli);
880 }
881 }
882 }
883
884 # new version manifest files explicitly list metadata files to be
885 # processed (ignoring complexmeta if set)
886 # [jmt12]
887 if ($self->{'manifest_version'} > 1)
888 {
889 # Process metadata files
890 foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}})
891 {
892 $self->perform_process_files($manifest, $pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs);
893 }
894 }
895 } # end if (manifest ne "")
896 else {
897 # if incremental, we read through the import folder to see whats changed.
898
899 if ($incremental || $incremental_mode eq "onlyadd") {
900 prime_doc_oid_count($archivedir);
901
902 # Can now work out which files were new, already existed, and have
903 # been deleted
904
905 new_vs_old_import_diff($archive_info,$block_hash,$importdir,
906 $archivedir,$verbosity,$incremental_mode);
907
908 my @new_files = sort keys %{$block_hash->{'new_files'}};
909 if (scalar(@new_files>0)) {
910 print STDERR "New files and modified metadata files since last import:\n ";
911 print STDERR join("\n ",@new_files), "\n";
912 }
913
914 if ($incremental) {
915 # only look for deletions if we are truely incremental
916 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
917 # Filter out any in gsdl/tmp area
918 my @filtered_deleted_files = ();
919 my $gsdl_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
920 my $collect_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
921 $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
922 $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
923
924 foreach my $df (@deleted_files) {
925 next if ($df =~ m/^$gsdl_tmp_area/);
926 next if ($df =~ m/^$collect_tmp_area/);
927
928 push(@filtered_deleted_files,$df);
929 }
930
931
932 @deleted_files = @filtered_deleted_files;
933
934 if (scalar(@deleted_files)>0) {
935 print STDERR "Files deleted since last import:\n ";
936 print STDERR join("\n ",@deleted_files), "\n";
937
938
939 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
940
941 mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
942 }
943
944 my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
945
946 if (scalar(@reindex_files)>0) {
947 print STDERR "Files to reindex since last import:\n ";
948 print STDERR join("\n ",@reindex_files), "\n";
949 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
950 mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
951 }
952
953 }
954 } # end if incremental/only_add mode
955 # else no manifest AND not incremental
956 } # end if else block of manifest ne "" else eq ""
957
958 # Check for existence of the file that's to contain earliestDateStamp in archivesdir
959 # Do nothing if the file already exists (file exists on incremental build).
960 # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
961 # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
962 # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
963 # oailastmodified and oailastmodifieddate
964 my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp");
965 if ($self->{'generate_auxiliary_files'}) {
966 if (!-f $earliestDatestampFile && -d $archivedir) {
967 my $current_time_in_seconds = time; # in seconds
968
969 if(open(FOUT, ">$earliestDatestampFile")) {
970 # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
971 print FOUT $current_time_in_seconds;
972 close(FOUT);
973 }
974 else {
975 &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
976 }
977
978 }
979 }
980
981 $self->perform_process_files($manifest, $pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs);
982
983 if ($saveas eq "FedoraMETS") {
984 # create collection "doc obj" for Fedora that contains
985 # collection-level metadata
986
987 my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
988 $doc_obj->set_OID("collection");
989
990 my $col_name = undef;
991 my $col_meta = $collectcfg->{'collectionmeta'};
992
993 if (defined $col_meta) {
994 store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
995 store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
996 }
997 $processor->process($doc_obj);
998 }
999
1000 &plugin::end($pluginfo, $processor);
1001
1002 &plugin::deinit($pluginfo, $processor);
1003
1004 # Store the value of OIDCount (used in doc.pm) so it can be
1005 # restored correctly to this value on an incremental build
1006 # - this OIDcount file should only be generated for numerical oids [jmt12]
1007 if ($self->{'OIDtype'} eq 'incremental')
1008 {
1009 store_doc_oid_count($archivedir);
1010 }
1011
1012 # signal to the processor (plugout) that we have finished processing - if we are group processing, then the final output file needs closing.
1013 $processor->close_group_output() if $processor->is_group();
1014 $processor->end();
1015
1016 # if ($inexport_mode eq "import") {
1017 if ($self->{'generate_auxiliary_files'}) {
1018 # write out the archive information file
1019 # for backwards compatability with archvies.inf file
1020 if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
1021 $archive_info->save_info($arcinfo_doc_filename);
1022 }
1023 else {
1024 $archive_info->save_revinfo_db($arcinfo_src_filename);
1025 }
1026 }
1027 return $pluginfo;
1028}
1029
1030# @function perform_process_files()
1031# while process_files() above prepares the system to import files this is the
1032# function that actually initiates the plugin pipeline to process the files.
1033# This function should therefore be overridden in subclasses of inexport.pm should
1034# they wish to do different or further processing
1035# @author jmt12
sub perform_process_files
{
    my $self = shift(@_);
    my ($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs) = @_;

    my $gli = $self->{'gli'};

    # A non-empty filename means a (version 2+) manifest has named this
    # specific file, so hand just that file to the plugin pipeline.
    if ($file_to_import ne '')
    {
	&plugin::read($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
	return;
    }

    # Otherwise do a global scan of the import directory, but only when
    # there is no manifest at all, or when the manifest is the older
    # version 1 format (whose filtering is carried in $block_hash).
    # Non-manifest imports obey process_exp/block_exp as usual. [jmt12]
    if ($manifest eq '' || $self->{'manifest_version'} == 1)
    {
	&plugin::read($pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
	return;
    }

    # Version 2+ manifest present but no explicit file: nothing to do.
    print STDERR "Skipping perform_process_files() due to manifest presence and version\n";
}
1059# perform_process_files()
1060
1061# @function generate_statistics()
sub generate_statistics
{
    my ($self, $pluginfo) = @_;

    my $inexport_mode = $self->{'mode'};
    my $out = $self->{'out'};

    # Print a completion banner; the "{import.complete}" /
    # "{export.complete}" key is resolved by gsprintf's resource bundle.
    my $banner = "*********************************************\n";
    &gsprintf($out, "\n");
    &gsprintf($out, $banner);
    &gsprintf($out, "{$inexport_mode.complete}\n");
    &gsprintf($out, $banner);

    # Delegate the per-plugin statistics report to the plugin layer.
    &plugin::write_stats($pluginfo, $self->{'statsfile'}, $self->{'faillogname'}, $self->{'gli'});
}
1080# generate_statistics()
1081
1082
1083# @function deinit()
1084# Close down any file handles that we opened (and hence are responsible for
1085# closing
sub deinit
{
    my ($self) = @_;

    # Only close the handles this object opened itself (flags are set
    # wherever the handles were opened, earlier in this module).
    if ($self->{'close_out'})       { close OUT; }
    if ($self->{'close_faillog'})   { close FAILLOG; }
    if ($self->{'close_statsfile'}) { close STATSFILE; }
}
1093# deinit()
1094
1095
# Copy one collection-level metadata field (possibly per-language) onto
# the top section of $doc_obj as "ex."-namespaced metadata.
sub store_collectionmeta
{
    my ($collectionmeta, $field, $doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $per_lang_values = $collectionmeta->{$field};

    while (my ($lang_key, $value) = each %$per_lang_values)
    {
	# Base label is the field in the extracted-metadata namespace;
	# a "[l=xx]" key narrows it to a single language via "^xx".
	my $label = "ex.$field";
	if ($lang_key =~ m/^\[l=(.*?)\]$/)
	{
	    $label .= "^$1";
	}

	$doc_obj->add_utf8_metadata($top_section, $label, $value);

	# see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
	# while "collectionname" in GS2 is called "name" in GS3.
	# Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3.
	# The collection's (English/default) name also doubles as its
	# Dublin Core title.
	if (($label eq "ex.collectionname^en") || ($label eq "ex.collectionname"))
	{
	    $doc_obj->add_utf8_metadata($top_section, "dc.Title", $value);
	}
    }
}
1133
1134
# Return the full path of the "OIDcount" bookkeeping file kept inside
# the given archives directory.
sub oid_count_file {
    my ($archivedir) = @_;

    my $counter_path = &FileUtils::filenameConcatenate($archivedir, "OIDcount");
    return $counter_path;
}
1139
1140
# Restore doc.pm's global OID counter from the "OIDcount" file in the
# archives directory (written by store_doc_oid_count at the end of a
# previous import), so an incremental build continues numbering where
# the last run left off. Silently does nothing if the file is absent
# (the normal situation on a full build).
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    return unless (-e $oid_count_filename);

    # 3-arg open with a lexical handle (the old 2-arg bareword form was
    # vulnerable to mode injection through the filename).
    if (open(my $oid_in, '<', $oid_count_filename)) {
	my $OIDcount = <$oid_in>;
	close($oid_in);

	# Guard against an empty file: chomp/assigning undef would just
	# emit warnings and clobber the counter with undef.
	if (defined $OIDcount) {
	    chomp $OIDcount;
	    $doc::OIDcount = $OIDcount;
	}
    }
    else {
	&gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
    }
}
1160
sub store_doc_oid_count
{
    # Use the file "OIDcount" in the archives directory to record
    # what value doc.pm got up to, so that prime_doc_oid_count() can
    # restore it on the next incremental build.

    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # 3-arg open with a lexical handle (the old 2-arg bareword form was
    # vulnerable to mode injection through the filename).
    # @todo $oidout = &FileUtils::openFileDescriptor($oid_count_filename, 'w') [jmt12]
    if (open(my $oid_out, '>', $oid_count_filename)) {
	print {$oid_out} $doc::OIDcount, "\n";

	# Buffered write errors only surface at close time, so report
	# a failed close the same way as a failed open.
	if (!close($oid_out)) {
	    &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
	}
    }
    else {
	&gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
1179
1180
1181
# Compare the current contents of the import folder against the file
# list recorded at the previous import, and partition the files in
# $block_hash->{'all_files'} into 'new_files', 'existing_files',
# 'reindex_files', 'new_or_modified_metadata_files' and
# 'deleted_files'. Modification is judged by comparing each file's -M
# age against the age of the archiveinf-doc database.
# NOTE(review): block-hash keys are assumed to be full/long (non-DOS)
# absolute paths — hence the util::upgrade_if_dos_filename() calls when
# crossing over from arcinfo data.
sub new_vs_old_import_diff
{
    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    # -M gives age in days since last modification; anything with a
    # smaller age than the archive database was touched after the last
    # build.
    my $archiveinf_timestamp = -M $arcinfo_doc_filename;

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default

    my $prev_all_files = $archive_info->{'prev_import_filelist'};
    my $full_prev_all_files = {};

    foreach my $prev_file (keys %$prev_all_files) {
	# arcinfo deals in real filenames ie windows short names. but the block hash stuff is all full long versions.
	$prev_file = &util::upgrade_if_dos_filename($prev_file);

	if (!&FileUtils::isFilenameAbsolute($prev_file)) {
	    my $full_prev_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$prev_file);
	    $full_prev_all_files->{$full_prev_file} = $prev_file;
	}
	else {
	    $full_prev_all_files->{$prev_file} = $prev_file;
	}
    }


    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

	my $full_curr_file = $curr_file;

	# entry in 'all_files' is moved to either 'existing_files',
	# 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

	if (!&FileUtils::isFilenameAbsolute($curr_file)) {
	    # add in import dir to make absolute
	    $full_curr_file = &FileUtils::filenameConcatenate($importdir,$curr_file);
	}

	# figure out if new file or not
	if (defined $full_prev_all_files->{$full_curr_file}) {
	    # delete it so that only files that need deleting are left
	    delete $full_prev_all_files->{$full_curr_file};
	    # had it before. is it a metadata file?
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# is it modified??
		if (-M $full_curr_file < $archiveinf_timestamp) {
		    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
		    # its newer than last build
		    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
		}
	    }
	    else {
		if ($incremental_mode eq "all") {

		    # had it before
		    $block_hash->{'existing_files'}->{$full_curr_file} = 1;

		}
		else {
		    # Warning in "onlyadd" mode, but had it before!
		    print STDERR "Warning: File $full_curr_file previously imported.\n";
		    print STDERR " Treating as new file\n";

		    $block_hash->{'new_files'}->{$full_curr_file} = 1;

		}
	    }
	}
	else {
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# the new file is the special sort of file greenstone uses
		# to attach metadata to src documents
		# i.e metadata.xml
		# (but note, the filename used is not constrained in
		# Greenstone to always be this)

		print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
		$block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
	    }
	    else {
		$block_hash->{'new_files'}->{$full_curr_file} = 1;
	    }
	}


	delete $block_hash->{'all_files'}->{$curr_file};
    }




    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
	my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

	$situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
	$situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression

	# Go through existing_files, and mark anything that is contained
	# within 'situated_dir' to be reindexed (in case some of the metadata
	# attaches to one of these files)

	my $reindex_files = [];

	foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

	    if ($existing_f =~ m/^$situated_dir/) {

		# print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

		push(@$reindex_files,$existing_f);
		$block_hash->{'reindex_files'}->{$existing_f} = 1;
		delete $block_hash->{'existing_files'}->{$existing_f};

	    }
	}

	# metadata file needs to be in new_files list so parsed by MetadataXMLPlug
	# (or equivalent)
	$block_hash->{'new_files'}->{$new_mdf} = 1;

    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
	if (-M $existing_filename < $archiveinf_timestamp) {
	    # file is newer than last build

	    my $existing_file = $existing_filename;
	    #my $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'});

	    #my $collectdir_resafe = &util::filename_to_regex($collectdir);
	    #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;

	    print STDERR "**** Reindexing existing file: $existing_file\n";

	    push(@$reindex_files,$existing_file);
	    $block_hash->{'reindex_files'}->{$existing_filename} = 1;
	}

    }


    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # supress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    my @deleted_files = values %$full_prev_all_files;
    # NOTE: the values above may be relative paths (the original keys of
    # prev_import_filelist), so they are made absolute before the -e test.
    # The upgrade_if_dos_filename() below ensures long filenames go into
    # 'deleted_files', matching the rest of the block hash (Windows
    # short-name deletion bugfix).
    map { my $curr_file = $_;
	  my $full_curr_file = $curr_file;

	  if (!&FileUtils::isFilenameAbsolute($curr_file)) {
	      # add in import dir to make absolute

	      $full_curr_file = &FileUtils::filenameConcatenate($collectdir,$curr_file);
	  }


	  if (!-e $full_curr_file) {
	      $curr_file = &util::upgrade_if_dos_filename($curr_file);
	      $block_hash->{'deleted_files'}->{$curr_file} = 1;
	  }
    } @deleted_files;



}
1380
1381
1382# this is used to delete "deleted" docs and to remove old versions of "changed" docs
1383# $mode is 'delete' or 'reindex'
# For each file in @$deleted_files, find every OID it contributed to
# (via the archiveinf-src reverse index), mark those OIDs with status
# "D" in the archiveinf-doc database, and schedule related primary
# files for reindexing in $block_hash->{'reindex_files'}.
# @param $mode  'delete' or 'reindex' (affects log wording, and in
#               'delete' mode deleted primaries are pruned from the
#               reindex set at the end).
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
	$mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
	# use 'archiveinf-src' info database file to look up all the OIDs
	# that this file is used in (note in most cases, it's just one OID)

	# arcinfo stores real (possibly Windows short) filenames, so
	# downgrade the long form before the reverse lookup.
	my $downgraded_file = &util::downgrade_if_dos_filename($file);
	my $oids = $archive_info->get_reverseinfo($downgraded_file);
	$archive_info->remove_reverseinfo($downgraded_file);

	foreach my $oid (@$oids) {
	    # get the record for this OID from doc db
	    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
	    # find the source doc (the primary file that becomes this oid)
	    my $doc_source_file = $doc_rec->{'src-file'}->[0];
	    # expand stored placeholders back to an absolute ("long") path
	    $doc_source_file = &util::placeholders_to_abspath($doc_source_file, "long");

	    if (!&FileUtils::isFilenameAbsolute($doc_source_file)) {
		$doc_source_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
	    }

	    if ($doc_source_file ne $file) {
		# its an associated or metadata file
		# mark source doc for reimport as one of its assoc files has changed or deleted
		#$doc_source_file = &util::upgrade_if_dos_filename($doc_source_file);
		$block_hash->{'reindex_files'}->{$doc_source_file} = 1;

	    } else {

		# the file to be deleted/reindexed is a primary file. We need to remove all references to this in the src db
		my $assoc_files = $doc_rec->{'assoc-file'};
		foreach my $assocfile (@$assoc_files) {
		    $assocfile = &util::placeholders_to_abspath($assocfile);
		    $archive_info->remove_reverseinfo($assocfile, $oid);
		    if (!defined $archive_info->get_reverseinfo($assocfile)) {
			# nothing refers to it anymore, mark for reindex.
			# block hash needs full filenames
			$assocfile = &util::upgrade_if_dos_filename($assocfile);
			$block_hash->{'reindex_files'}->{$assocfile} = 1;
		    }
		}

	    }
	    my $curr_status = $archive_info->get_status_info($oid);
	    if (defined($curr_status) && (($curr_status ne "D"))) {
		if ($verbosity>1) {
		    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
		}
		# mark oid for deletion (it will be deleted or reimported)
		$archive_info->set_status_info($oid,"D");
		# Rewrite the raw db record with its <index-status> line
		# flipped to D, then append it back to the doc database.
		my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
		$val =~ s/^<index-status>(.*)$/<index-status>D/m;

		my $val_rec = &dbutil::convert_infodb_string_to_hash($infodbtype,$val);
		my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

		&dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
		&dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
	    }
	}

    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.

    if ($mode eq "delete") {
	foreach my $file (@$deleted_files) {
	    if (defined $block_hash->{'reindex_files'}->{$file}) {
		delete $block_hash->{'reindex_files'}->{$file};
	    }
	}
    }


}
1477
# Recursively collect every file beneath $dirname, pushing full paths
# onto the arrayref $list. Directories themselves are not listed; '.',
# '..' and '.svn' entries are skipped.
# Returns -1 if the directory could not be read (after printing a
# warning), otherwise returns nothing meaningful.
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # Read all entries up front with a lexical dirhandle (the old code
    # used a bareword DIR handle and declared an unused $subfile).
    my $dir_handle;
    if (!opendir($dir_handle, $dirname)) {
	print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    my @entries = readdir($dir_handle);
    closedir($dir_handle);

    foreach my $entry (@entries) {
	next if ($entry =~ m/^\.\.?$/);	# skip '.' and '..'
	next if ($entry =~ m/^\.svn$/);	# skip subversion metadata

	my $full_file = &FileUtils::filenameConcatenate($dirname, $entry);
	if (-d $full_file) {
	    # Recur over directory contents.
	    &add_dir_contents_to_list($full_file, $list);
	}
	else {
	    push(@$list, $full_file);
	}
    }

}
1506
1507
15081;
Note: See TracBrowser for help on using the repository browser.