source: main/trunk/greenstone2/perllib/inexport.pm@ 37152

Last change on this file since 37152 was 37152, checked in by davidb, 16 months ago

The commit adds the newly developed 'File-Level Document-Version History' feature (fldv-history for short in the code). The work targets the 'archives' directory area. The core idea is to have older versions of an archives' document folder contained inside the latest generated version. To achieve this, inside an archives' document folder there can now be a '_fldv_history' folder, and inside that have sub-folders conforming to the pattern 'nminus-1', 'nminus-2', ... These are the older versions of the archives' document. The filenames are literally as just typed: nminus-1 contains the most recent stored version of the document; nminus-2 (if it exists) is the second most recent version, and so on. When import.pl is run with -incremental and -keepold then any existing documents that need to be re-processed will trigger the formation of _fldv_history/nminus-1, storing the previous version in it. If import.pl -incremental -keepold is run again and the doc has changed again, then nminus-1 is moved to nminus-2, and a new nminus-1 is generated. With the addition of this new feature, there is a use-case of running -keepold without -incremental (and for it to be 'addonly'). To be clear, the 'onlyadd' functionality still works, however the code now does extra work to ensure that any existing documents in archives get the file-level document-version history treatment as well. This allows a collection builder to manually add content into import (even choosing to leave existing content previously processed there if they want), and when import.pl -keepold is next run then a 'collection-wide' file-level document-version history of existing documents is triggered in 'archives'. The idea of a 'collection-wide' (global) document history mechanism could be a useful way for a user to manage their collection.
Hardlinking is used throughout the new code, so what occurs on the file system is not particularly expensive, although the overall collection build takes longer than 'onlyadd' as it does reprocess all of the existing documents again. In the case of a user running import.pl -keepold and realizing this was in fact a mistake, there is a new minus option '-replaceold'. This works in much the same way as 'keepold'; only, when it comes to the file-level document-version history feature, it does not add in yet another stored document version, rather it replaces the one previously stored at 'nminus-1' with this one, effectively undoing the 'mistake' of the previous build.

  • Property svn:executable set to *
File size: 53.7 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use oaiinfo;
38use plugin;
39use plugout;
40use manifest;
41use inexport;
42use util;
43use scriptutil;
44use FileHandle;
45use gsprintf 'gsprintf';
46use printusage;
47use parse2;
48
49use DocHistoryFileUtils;
50use FileUtils;
51
52use File::Basename;
53
# Table of the supported OID-generation schemes, in the order they are
# presented to the user.  Every entry's display string follows the same
# resource-bundle pattern "{import.OIDtype.<name>}", so the table is
# generated from the list of scheme names rather than written out longhand.
my $oidtype_list =
    [ map { { 'name' => $_,
	      'desc' => "{import.OIDtype.$_}" } }
      qw(hash hash_on_full_filename assigned incremental filename dirname full_filename) ];
69
# Command-line options that locate the collection on disk.  Kept separate
# from $inexport::arguments so callers (import.pl/export.pl) can splice the
# directory options in wherever their usage text wants them.
$inexport::directory_arguments =
    [
      { 'name'      => "importdir",
	'desc'      => "{import.importdir}",
	'type'      => "string",
	'deft'      => "import",
	'reqd'      => "no",
	'hiddengli' => "yes" },
      { 'name'      => "collectdir",
	'desc'      => "{import.collectdir}",
	'type'      => "string",
	# parsearg left "" as default
	#'deft' => &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect"),
	'deft'      => "",
	'reqd'      => "no",
	'hiddengli' => "yes" },
    ];
# The main option table shared by import.pl and export.pl.  Each entry is a
# parse2/PrintUsage argument descriptor: 'name' (flag name), 'desc'
# (resource-bundle key for the usage string), 'type' (string/int/enum/flag),
# optional 'deft' (default), 'reqd', 'range', and the GLI visibility hints
# 'hiddengli'/'modegli'.
$inexport::arguments =
    [
      # --- document identifier options ---
      # don't set the default to hash - want to allow this to come from
      # entry in collect.cfg but want to override it here
      { 'name' => "OIDtype",
	'desc' => "{import.OIDtype}",
	'type' => "enum",
	'list' => $oidtype_list,
	'deft' => "hash_on_full_filename",
	'reqd' => "no",
	'modegli' => "2" },
      { 'name' => "OIDmetadata",
	'desc' => "{import.OIDmetadata}",
	'type' => "string",
	'deft' => "dc.Identifier",
	'reqd' => "no",
	'modegli' => "2" },
      # --- site/manifest (GS3 and incremental-build support) ---
      { 'name' => "site",
	'desc' => "{import.site}",
	'type' => "string",
	'deft' => "",
	'reqd' => "no",
	'hiddengli' => "yes" },
      { 'name' => "manifest",
	'desc' => "{import.manifest}",
	'type' => "string",
	'deft' => "",
	'reqd' => "no",
	'hiddengli' => "yes" } ,
      # --- rebuild-mode flags: incremental / keepold / replaceold / removeold
      #     are reconciled later by scriptutil::check_removeold_keepold_replaceold ---
      { 'name' => "incremental",
	'desc' => "{import.incremental}",
	'type' => "flag",
	'hiddengli' => "yes" },
      { 'name' => "keepold",
	'desc' => "{import.keepold}",
	'type' => "flag",
	'reqd' => "no",
	'hiddengli' => "yes" },
      { 'name' => "replaceold",
	'desc' => "{import.replaceold}",
	'type' => "flag",
	'reqd' => "no",
	'hiddengli' => "yes" },
      { 'name' => "removeold",
	'desc' => "{import.removeold}",
	'type' => "flag",
	'reqd' => "no",
	'hiddengli' => "yes" },
      # --- general behaviour ---
      { 'name' => "language",
	'desc' => "{scripts.language}",
	'type' => "string",
	'reqd' => "no",
	'hiddengli' => "yes" },
      { 'name' => "maxdocs",
	'desc' => "{import.maxdocs}",
	'type' => "int",
	'reqd' => "no",
	'deft' => "-1",           # -1 means "no limit"
	'range' => "-1,",
	'modegli' => "1" },
      { 'name' => "debug",
	'desc' => "{import.debug}",
	'type' => "flag",
	'reqd' => "no",
	'hiddengli' => "yes" },
      # --- output / logging destinations ---
      { 'name' => "faillog",
	'desc' => "{import.faillog}",
	'type' => "string",
	# parsearg left "" as default
	#'deft' => &FileUtils::filenameConcatenate("<collectdir>", "colname", "etc", "fail.log"),
	'deft' => "",
	'reqd' => "no",
	'modegli' => "3" },
      { 'name' => "out",
	'desc' => "{import.out}",
	'type' => "string",
	'deft' => "STDERR",
	'reqd' => "no",
	'hiddengli' => "yes" },
      { 'name' => "statsfile",
	'desc' => "{import.statsfile}",
	'type' => "string",
	'deft' => "STDERR",
	'reqd' => "no",
	'hiddengli' => "yes" },
      { 'name' => "verbosity",
	'desc' => "{import.verbosity}",
	'type' => "int",
	'range' => "0,",
	'deft' => "2",
	'reqd' => "no",
	'modegli' => "3" },
      # --- front-end integration flags ---
      { 'name' => "gli",
	'desc' => "{scripts.gli}",
	'type' => "flag",
	'reqd' => "no",
	'hiddengli' => "yes" },
      { 'name' => "xml",
	'desc' => "{scripts.xml}",
	'type' => "flag",
	'reqd' => "no",
	'hiddengli' => "yes" },

    ];
192
# Constructor used by import.pl/export.pl.
#   $mode                - "import" or "export"
#   $argv                - ref to the remaining command-line args (consumed;
#                          on success exactly the collection name is left)
#   $options             - parse2/PrintUsage option spec for this script
#   $opt_listall_options - option spec used when -listall is given
# Returns a blessed inexport object, or dies (after printing usage) on
# bad arguments.  Side effects: may redirect 'out'/'statsfile' to files.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
	&PrintUsage::print_txt_usage($options, "{import.params}",1);
	print STDERR "Something went wrong during parsing the arguments. Scroll up for details.\n";
	die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
	&gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($self->{'listall'}) {
	if ($self->{'xml'}) {
	    &PrintUsage::print_xml_usage($opt_listall_options);
	}
	else
	{
	    &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
	}
	die "\n";
    }

    if ($self->{'xml'}) {
	&PrintUsage::print_xml_usage($options);
	print "\n";
	return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
	&gsprintf::output_strings_in_UTF8;
    }

    # If the user specified -h, then we output the usage
    if (@$argv && $argv->[0] =~ /^\-+h/) {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }
    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    if ($intArgLeftinAfterParsing != 1 )
    {
	&PrintUsage::print_txt_usage($options, "{import.params}", 1);
	print STDERR "There should be one argument left after parsing the script args: the collection name.\n";
	die "\n";
    }

    # Redirect 'out' to a file if it isn't STDERR/STDOUT.  Use 3-arg open so
    # a filename can never smuggle in an open mode; the bareword handle is
    # deliberate - its fully-qualified name is stored as a string and used as
    # the handle from here on (the file runs under "no strict 'refs'").
    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
	open (OUT, '>', $out) ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
	$out = 'inexport::OUT';
	$self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # Same treatment for the statistics output stream.
    my $statsfile = $self->{'statsfile'};
    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
	open (STATSFILE, '>', $statsfile) ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $statsfile) && die);
	$statsfile = 'inexport::STATSFILE';
	$self->{'close_stats'} = 1;
    }
    $statsfile->autoflush(1);
    $self->{'statsfile'} = $statsfile;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # Unless otherwise stated all manifests are considered version 1---where
    # they act more like an advanced process expression---as compared to newer
    # manifest files that act as an explicit (and exhaustive) list of files to
    # process [jmt12]
    $self->{'manifest_version'} = 1;

    return bless $self, $class;
}
287
# Simplified constructor for use from CGI scripts: no command-line parsing,
# output fixed to STDERR, and the collect directory taken either from the
# supplied gsdl_cgi helper (GS3, per-site) or from $GSDLHOME/collect (GS2).
sub newCGI
{
    my $class = shift (@_);
    my ($mode, $collection_name, $cgi_obj, $site_option) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    $self->{'out'} = STDERR;

    if (!defined $cgi_obj) {
	# no CGI helper => classic GS2 layout under GSDLHOME
	$self->{'site'}       = "";
	$self->{'collectdir'} = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
    }
    else {
	# GS3: ask the CGI helper where this site keeps its collections
	$self->{'site'}       = $site_option;
	$self->{'collectdir'} = $cgi_obj->get_collection_dir($site_option);
    }

    $self->{'faillog'}    = "";
    $self->{'collection'} = $collection_name;

    return bless $self, $class;
}
# Accessor: the collection name this inexport object was constructed for.
sub get_collection
{
    my ($self) = @_;

    return $self->{'collection'};
}
319
320
# Locate the collection, work out whether this is a GS2 or GS3 build, open
# the fail log, and read the collection configuration file.
#   $collection - collection name (resolved via colcfg::use_collection)
#   $options    - option spec (only used for the commented-out usage print)
# Returns ($config_filename, $collectcfg).  Dies if the collection cannot
# be found.  Side effects: sets gs_version/faillog*/config_filename on
# $self, augments @INC with the collection's perllib, opens FAILLOG.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
	#&PrintUsage::print_txt_usage($options, "{import.params}", 1);
	die "\n";
    }

    # set gs_version 2/3: a non-empty site means we are inside a GS3 servlet site
    $self->{'gs_version'} = "2";
    if ((defined $site) && ($site ne "")) {
	# gs3
	$self->{'gs_version'} = "3";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib'));

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
	$faillog = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # 3-arg open so the configured path can never inject an open mode; the
    # bareword handle is kept on purpose - its fully-qualified name is used
    # as a string handle below (file runs under "no strict 'refs'")
    open (FAILLOG, '>', $faillog) ||
	(&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);

    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;
    $self->{'close_faillog'} = 1;

    # Read in the collection configuration file.
    my $gs_mode = "gs".$self->{'gs_version'}; #gs2 or gs3
    my $config_filename = &colcfg::get_collect_cfg_name($out, $gs_mode);

    # store the config file's name, so oaiinfo object constructor can be instantiated with it
    $self->{'config_filename'} = $config_filename;

    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
373
# Reconcile command-line options with the collection configuration
# ($collectcfg) and normalize the import/archive directory paths.
#   - collect.cfg values win only when the corresponding command-line option
#     was left at its default (the 'default_*' keys record that fact)
#   - dies if the resolved import directory does not exist
# Side effects: fills in importdir/archivedir/archivedir_keepold and the
# removeold/keepold/replaceold/incremental mode fields on $self.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $importdir  = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
    my $out = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
	$collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
	# we can't use the text version for archives dbs.
	$collectcfg->{'infodbtype'} = "gdbm";
    }

    if (defined $self->{'default_importdir'} && defined $collectcfg->{'importdir'}) {
	$importdir = $collectcfg->{'importdir'};
    }

    if ($inexport_mode eq "import") {
	if ( defined $self->{'default_archivedir'} && defined $collectcfg->{'archivedir'}) {
	    $archivedir = $collectcfg->{'archivedir'};
	}
    }
    elsif ($inexport_mode eq "export") {
	if (defined $self->{'default_exportdir'} && defined $collectcfg->{'exportdir'}) {
	    $archivedir = $collectcfg->{'exportdir'};
	}
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    if (!&FileUtils::isFilenameAbsolute($importdir))
    {
	$importdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $importdir);
    }
    else
    {
	# Don't do this - it kills protocol prefixes
	#$importdir =~ s/[\\\/]+/\//g;
	#$importdir =~ s/\/$//;
	# Do this instead.
	# BUGFIX: sanitizePath returns the cleaned path rather than modifying
	# its argument, so the result must be assigned (the archivedir branch
	# below already does this) - previously it was silently discarded.
	$importdir = &FileUtils::sanitizePath($importdir);
    }

    if (!&FileUtils::directoryExists($importdir))
    {
	&gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
	die "\n";
    }
    $self->{'importdir'} = $importdir;

    if (!&FileUtils::isFilenameAbsolute($archivedir)) {
	$archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $archivedir);
    }
    else {
	$archivedir = &FileUtils::sanitizePath($archivedir);
    }

    my $archivedir_keepold = "${archivedir}_keepold"; # used when file-level document-version history is in play
    $self->{'archivedir'} = $archivedir;
    $self->{'archivedir_keepold'} = $archivedir_keepold;

    if (defined $self->{'default_verbosity'}) {
	if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
	    $self->{'verbosity'} = $collectcfg->{'verbosity'};
	}
    }

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
	$self->{'manifest'} = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
	if ($collectcfg->{'gzip'} =~ /^true$/i) {
	    $self->{'gzip'} = 1;
	}
    }

    if (defined $self->{'default_maxdocs'}) {
	if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
	    $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
	}
    }

    if (defined $self->{'default_OIDtype'} ) {
	if (defined $collectcfg->{'OIDtype'}
	    && $collectcfg->{'OIDtype'} =~ /^(hash|hash_on_full_filename|incremental|assigned|filename|dirname|full_filename)$/) {
	    $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
	}
    }

    if (defined $self->{'default_OIDmetadata'}) {
	if (defined $collectcfg->{'OIDmetadata'}) {
	    $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
	}
    }

    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
	$self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
	$self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $replaceold, $incremental, $incremental_mode)
	= &scriptutil::check_removeold_keepold_replaceold($self->{'removeold'}, $self->{'keepold'}, $self->{'replaceold'},
							  $self->{'incremental'}, $checkdir,
							  $collectcfg);

    $self->{'removeold'}        = $removeold;
    $self->{'keepold'}          = $keepold;
    $self->{'replaceold'}       = $replaceold;
    $self->{'incremental'}      = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;

    # Since this wasted my morning, let's at least warn a user that manifest
    # files now *only* work if keepold is set [jmt12]
    if ($self->{'manifest'} && (!$keepold || !$incremental))
    {
	print STDERR "Warning: -manifest flag should not be specified without also setting -keepold or -incremental\n";
    }
}
509
510sub process_files
511{
512 my $self = shift @_;
513 my ($config_filename,$collectcfg) = @_;
514
515 my $inexport_mode = $self->{'mode'};
516
517 my $verbosity = $self->{'verbosity'};
518 my $debug = $self->{'debug'};
519
520 my $importdir = $self->{'importdir'};
521 my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};
522 # 'archivedir' is a tad abused, and is sometimes set to the 'exportdir' value,
523 # meaining 'archivedir_keepold' is actually the export dir name with '_keepold' appended
524 my $archivedir_keepold = $self->{'archivedir_keepold'};
525
526 my $incremental = $self->{'incremental'};
527 my $incremental_mode = $self->{'incremental_mode'};
528
529 my $gs_version = $self->{'gs_version'};
530
531 my $removeold = $self->{'removeold'};
532 my $replaceold = $self->{'replaceold'};
533 my $keepold = $self->{'keepold'};
534
535 my $saveas = $self->{'saveas'};
536 my $saveas_options = $self->{'saveas_options'};
537 my $OIDtype = $self->{'OIDtype'};
538 my $OIDmetadata = $self->{'OIDmetadata'};
539
540 my $out = $self->{'out'};
541 my $faillog = $self->{'faillog'};
542
543 my $maxdocs = $self->{'maxdocs'};
544 my $gzip = $self->{'gzip'};
545 my $groupsize = $self->{'groupsize'};
546 my $sortmeta = $self->{'sortmeta'};
547
548 my $removeprefix = $self->{'removeprefix'};
549 my $removesuffix = $self->{'removesuffix'};
550
551 my $gli = $self->{'gli'};
552
553 # related to export
554 my $xsltfile = $self->{'xsltfile'};
555 my $group_marc = $self->{'group_marc'};
556 my $mapping_file = $self->{'mapping_file'};
557 my $xslt_mets = $self->{'xslt_mets'};
558 my $xslt_txt = $self->{'xslt_txt'};
559 my $fedora_namespace = $self->{'fedora_namespace'};
560 my $metadata_prefix = $self->{'metadata_prefix'};
561
562 if ($inexport_mode eq "import") {
563 print STDERR "<Import>\n" if $gli;
564 }
565 else {
566 print STDERR "<export>\n" if $gli;
567 }
568
569 my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
570 if ($self->{'manifest'} ne "") {
571 my $manifest_filename = $self->{'manifest'};
572
573 if (!&FileUtils::isFilenameAbsolute($manifest_filename)) {
574 $manifest_filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
575 }
576 $self->{'manifest'} = &FileUtils::sanitizePath($self->{'manifest'});
577 #$self->{'manifest'} =~ s/[\\\/]+/\//g;
578 #$self->{'manifest'} =~ s/\/$//;
579
580 $manifest_lookup->parse($manifest_filename);
581
582 # manifests may now include a version number [jmt12]
583 $self->{'manifest_version'} = $manifest_lookup->get_version();
584 }
585
586 my $manifest = $self->{'manifest'};
587
588 # load all the plugins
589 my $plugins = [];
590 if (defined $collectcfg->{'plugin'}) {
591 $plugins = $collectcfg->{'plugin'};
592 }
593
594 my $plugin_incr_mode = $incremental_mode;
595 if ($manifest ne "") {
596 # if we have a manifest file, then we pretend we are fully incremental for plugins
597 $plugin_incr_mode = "all";
598 }
599 #some global options for the plugins
600 my @global_opts = ();
601
602 my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode, $gs_version, $self->{'site'});
603 if (scalar(@$pluginfo) == 0) {
604 &gsprintf($out, "{import.no_plugins_loaded}\n");
605 die "\n";
606 }
607
608 # Whether -removeold, -keepold or -replaceold there should never be an existing archivedir_keepold
609 # => Taken to be a sign of a previous import/export that has gone wrong
610 # => Print out error message and stop!
611
612 if (&FileUtils::directoryExists($archivedir_keepold)) {
613 my $rkr_old_minus_option = undef; # rkr = remove, keep, replace (whichever one is being used)
614 if ($removeold) {
615 $rkr_old_minus_option = "-removeold";
616 }
617 elsif ($keepold) {
618 $rkr_old_minus_option = "-keepold";
619 }
620 elsif ($replaceold) {
621 $rkr_old_minus_option = "-replaceold";
622 }
623
624 &gsprintf(STDERR, "\n");
625 &gsprintf(STDERR, "Detected existing directory:\n\n");
626 &gsprintf(STDERR, " $archivedir_keepold\n\n");
627 &gsprintf(STDERR, "Stopping $inexport_mode.\n\n");
628
629 &gsprintf(STDERR, "**** When building with $rkr_old_minus_option, there cannot be a pre-existing 'archives_keepold' directory\n");
630 &gsprintf(STDERR, "****\n");
631 &gsprintf(STDERR, "**** Review your collection directory folder, and determine whether to:\n");
632 &gsprintf(STDERR, "**** (a) move your 'archives_keepold' back to being 'archives'; or\n");
633 &gsprintf(STDERR, "**** (b) remove your 'archives_keepold'\n");
634 &gsprintf(STDERR, "**** before running your $inexport_mode command again\n\n");
635
636 exit 1; # c errno for 'operation not permitted'
637 }
638
639
640 # remove the old contents of the archives directory (and tmp directory) if needed
641
642 if ($removeold) {
643 if (&FileUtils::directoryExists($archivedir)) {
644 &gsprintf($out, "{import.removing_archives}\n");
645 &FileUtils::removeFilesRecursive($archivedir);
646 }
647 my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
648 $tmpdir =~ s/[\\\/]+/\//g;
649 $tmpdir =~ s/\/$//;
650 if (&FileUtils::directoryExists($tmpdir)) {
651 &gsprintf($out, "{import.removing_tmpdir}\n");
652 &FileUtils::removeFilesRecursive($tmpdir);
653 }
654 }
655 else {
656 # If not $removeold, then must be $keepold or $replaceold
657 # => for either case want to "hard-link"/copy 'archives' to 'archives_keepold'
658
659 # Want to be super careful about doing this, so as not to accidentally
660 # wipe out any previous file-level document-version history
661
662 # If got to here, then there is no pre-existing $archivedir_keepold
663 # => Hard-link copy the contents of 'archives' to 'archives_keepold'
664 # => Stop if there is any issue with creating the hard-link copy
665
666 if (!&FileUtils::hardlinkFilesRefRecursive([$archivedir],$archivedir_keepold, { 'strict' => 1 } )) {
667
668 &gsprintf(STDERR, "\nError message: $!\n\n");
669
670 &gsprintf(STDERR, "**** Failed to make a hard-link copy of:\n");
671 &gsprintf(STDERR, "**** $archivedir\n");
672 &gsprintf(STDERR, "**** to:\n");
673 &gsprintf(STDERR, "**** $archivedir_keepold\n");
674 &gsprintf(STDERR, "****\n");
675 &gsprintf(STDERR, "**** Unable to proceed with file-level document-version history $inexport_mode => Stopping\n");
676
677 exit $!;
678 }
679 }
680
681 # create the archives dir if needed
682 &FileUtils::makeAllDirectories($archivedir);
683
684 # read the archive information file
685
686 # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
687 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-doc"));
688 &util::rename_ldb_or_bdb_file(&FileUtils::filenameConcatenate($archivedir, "archiveinf-src"));
689
690 # When we make these initial calls to determine the archive information doc
691 # and src databases we pass through a '1' to indicate this is the first
692 # time we are referring to these databases. When using dynamic dbutils
693 # (available in extensions) this indicates to some database types (for
694 # example, persistent servers) that this is a good time to perform any
695 # one time initialization. The argument has no effect on vanilla dbutils
696 # [jmt12]
697 my $perform_firsttime_init = 1;
698 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, $perform_firsttime_init);
699 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir, $perform_firsttime_init);
700
701
702 my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
703 $archive_info->load_info($arcinfo_doc_filename);
704 # Load in reverse-lookup info (used to determine the docs that a file in import are used in),
705 # so we don't overwrite existing info when we do incremental import
706 # From here on, make all changes to this object, then write out the file at the end.
707 $archive_info->load_rev_info($arcinfo_src_filename);
708
709 if ($manifest eq "") {
710 # Load in list of files in import folder from last import (if present)
711 $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
712 }
713
714 ####Use Plugout####
715 my $plugout;
716
717 my $generate_auxiliary_files = 0;
718 if ($inexport_mode eq "import") {
719 $generate_auxiliary_files = 1;
720 }
721 elsif ($self->{'include_auxiliary_database_files'}) {
722 $generate_auxiliary_files = 1;
723 }
724 $self->{'generate_auxiliary_files'} = $generate_auxiliary_files;
725
726 # Option to use user defined plugout
727 if ($inexport_mode eq "import") {
728 if (defined $collectcfg->{'plugout'}) {
729 # If a plugout was specified in the collect.cfg file, assume it is sensible
730 # We can't check the name because it could be anything, if it is a custom plugout
731 print STDERR "Using plugout specified in collect.cfg: ".join(' ', @{$collectcfg->{'plugout'}})."\n";
732 $plugout = $collectcfg->{'plugout'};
733 }
734 else {
735 push @$plugout,$saveas."Plugout";
736 }
737
738 }
739 else {
740 if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(GreenstoneXML|.*METS|DSpace|MARCXML)Plugout/) {
741 $plugout = $collectcfg->{'plugout'};
742 print STDERR "Using plugout specified in collect.cfg: $collectcfg->{'plugout'}\n";
743 }
744 else {
745 push @$plugout,$saveas."Plugout";
746 }
747 }
748
749 my $plugout_name = $plugout->[0];
750
751 if (defined $saveas_options) {
752 my @user_plugout_options = split(" ", $saveas_options);
753 push @$plugout, @user_plugout_options;
754 }
755 push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
756 push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
757 push @$plugout,("-debug") if ($debug);
758 push @$plugout,("-gzip_output") if ($gzip);
759 push @$plugout,("-output_handle",$out) if (defined $out);
760 push @$plugout,("-site",$self->{'site'}) if (defined $self->{'site'});
761
762 push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");
763 push @$plugout, ("-no_auxiliary_databases") if ($generate_auxiliary_files == 0);
764 if ($inexport_mode eq "import") {
765 if ($plugout_name =~ m/^GreenstoneXMLPlugout$/) {
766 push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
767 }
768 }
769 my $processor = &plugout::load_plugout($plugout);
770 $processor->setoutputdir ($archivedir);
771 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
772 $processor->set_OIDtype ($OIDtype, $OIDmetadata);
773 $processor->begin();
774 &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
775
776 if ($removeold) {
777 # occasionally, plugins may want to do something on remove
778 # old, eg pharos image indexing
779 &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
780 }
781
782 # process the import directory
783 my $block_hash = {};
784 $block_hash->{'new_files'} = {};
785 $block_hash->{'reindex_files'} = {};
786
787 # all of these are set somewhere else, so it's more readable to define them here [jmt12]
788 $block_hash->{'all_files'} = {};
789 $block_hash->{'deleted_files'} = {};
790 $block_hash->{'file_blocks'} = {};
791 $block_hash->{'metadata_files'} = {};
792 $block_hash->{'shared_fileroot'} = '';
793 $block_hash->{'manifest'} = 'false';
794 my $metadata = {};
795
796 # global blocking pass may set up some metadata
797 # does this set up metadata?????
798 # - when we have a newer manifest file we don't do this -unless- the
799 # collection configuration indicates this collection contains complex
800 # (inherited) metadata [jmt12]
801 if ($manifest eq '' || (defined $collectcfg->{'complexmeta'} && $collectcfg->{'complexmeta'} eq 'true'))
802 {
803 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
804 }
805 else
806 {
807 print STDERR "Skipping global file scan due to manifest and complexmeta configuration\n";
808 }
809
810
811 # Prepare to work with the <collection>/etc/oai-inf.<db> that keeps track
812 # of the OAI identifiers with their time stamps and deleted status.
813 my $oai_info = new oaiinfo($self->{'config_filename'}, $collectcfg->{'infodbtype'}, $verbosity);
814 my $have_manifest = ($manifest eq '') ? 0 : 1;
815 $oai_info->import_stage($removeold, $have_manifest);
816
817
818 if ($manifest ne "") {
819
820 # mark that we are using a manifest - information that might be needed
821 # down in plugins (for instance DirectoryPlugin)
822 $block_hash->{'manifest'} = $self->{'manifest_version'};
823
824 #
825 # 1. Process delete files first
826 #
827 my @deleted_files = keys %{$manifest_lookup->{'delete'}};
828 my @full_deleted_files = ();
829
830 # ensure all filenames are absolute
831 foreach my $df (@deleted_files) {
832 my $full_df =
833 (&FileUtils::isFilenameAbsolute($df))
834 ? $df
835 : &FileUtils::filenameConcatenate($importdir,$df);
836
837 # gdb doesn't store short filenames, so ensure we specify full filenames for deletion
838 $full_df = &util::upgrade_if_dos_filename($full_df); # will only do something on windows
839
840 if (-d $full_df) {
841 &add_dir_contents_to_list($full_df, \@full_deleted_files);
842 } else {
843 push(@full_deleted_files,$full_df);
844 }
845 }
846
847 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
848 mark_docs_for_deletion($archive_info,{},
849 \@full_deleted_files,
850 $archivedir, $verbosity, "delete");
851
852
853 #
854 # 2. Now files for reindexing
855 #
856
857 my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
858 my @full_reindex_files = ();
859 # ensure all filenames are absolute
860 foreach my $rf (@reindex_files) {
861 my $full_rf =
862 (&FileUtils::isFilenameAbsolute($rf))
863 ? $rf
864 : &FileUtils::filenameConcatenate($importdir,$rf);
865
866 if (-d $full_rf) {
867 &add_dir_contents_to_list($full_rf, \@full_reindex_files);
868 } else {
869 push(@full_reindex_files,$full_rf);
870 }
871 }
872
873 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
874 mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
875
876 # And now to ensure the new version of the file processed by
877 # appropriate plugin, we need to add it to block_hash reindex list
878 foreach my $full_rf (@full_reindex_files) {
879 $block_hash->{'reindex_files'}->{$full_rf} = 1;
880 }
881
882
883 #
884 # 3. Now finally any new files - add to block_hash new_files list
885 #
886
887 my @new_files = keys %{$manifest_lookup->{'index'}};
888 my @full_new_files = ();
889
890 foreach my $nf (@new_files) {
891 # ensure filename is absolute
892 my $full_nf =
893 (&FileUtils::isFilenameAbsolute($nf))
894 ? $nf
895 : &FileUtils::filenameConcatenate($importdir,$nf);
896
897 if (-d $full_nf) {
898 &add_dir_contents_to_list($full_nf, \@full_new_files);
899 } else {
900 push(@full_new_files,$full_nf);
901 }
902 }
903
904 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
905
906 # need to check this file exists before trying to read it - in the past
907 # it wasn't possible to have a manifest unless keepold was also set so
908 # you were pretty much guaranteed arcinfo existed
909 # [jmt12]
910 # @todo &FileUtils::fileExists($arcinfo_src_filename) [jmt12]
911 if (-e $arcinfo_src_filename)
912 {
913 my $arcinfodb_map = {};
914 &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
915 foreach my $f (@full_new_files) {
916 my $rel_f = &util::abspath_to_placeholders($f);
917
918 # check that we haven't seen it already
919 if (defined $arcinfodb_map->{$rel_f}) {
920 # TODO make better warning
921 print STDERR "Warning: $f ($rel_f) already in src archive, \n";
922 } else {
923 $block_hash->{'new_files'}->{$f} = 1;
924 }
925 }
926
927 undef $arcinfodb_map;
928 }
929 # no existing files - so we can just add all the files [jmt12]
930 else
931 {
932 foreach my $f (@full_new_files)
933 {
934 $block_hash->{'new_files'}->{$f} = 1;
935 }
936 }
937
938 # If we are not using complex inherited metadata (and thus have skipped
939 # the global file scan) we need to at least check for a matching
940 # metadata.xml for the files being indexed/reindexed
941 # - unless we are using the newer version of Manifests, which are treated
942 # verbatim, and should have a metadata element for metadata files (so
943 # we can explicitly process metadata files other than metadata.xml)
944 # [jmt12]
945 if ($self->{'manifest_version'} == 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true'))
946 {
947 my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
948 foreach my $file_to_import (@all_files_to_import)
949 {
950 my $metadata_xml_path = $file_to_import;
951 $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/;
952 if (&FileUtils::fileExists($metadata_xml_path))
953 {
954 &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli);
955 }
956 }
957 }
958
959 # new version manifest files explicitly list metadata files to be
960 # processed (ignoring complexmeta if set)
961 # [jmt12]
962 if ($self->{'manifest_version'} > 1)
963 {
964 # Process metadata files
965 foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}})
966 {
967 $self->perform_process_files($manifest, $pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs);
968 }
969 }
970 } # end if (manifest ne "")
971 else {
972 # if incremental, we read through the import folder to see whats changed.
973
974 if ($incremental || $incremental_mode eq "onlyadd") {
975 prime_doc_oid_count($archivedir);
976
977 # Can now work out which files were new, already existed, and have
978 # been deleted
979
980 new_vs_old_import_diff($archive_info,$block_hash,$importdir,
981 $archivedir,$verbosity,$incremental_mode);
982
983 my @new_files = sort keys %{$block_hash->{'new_files'}};
984 if (scalar(@new_files>0)) {
985 print STDERR "New files and modified metadata files since last import:\n ";
986 print STDERR join("\n ",@new_files), "\n";
987 }
988
989 if ($incremental) {
990 # only look for deletions if we are truely incremental
991 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
992 # Filter out any in gsdl/tmp area
993 my @filtered_deleted_files = ();
994 my $gsdl_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
995 my $collect_tmp_area = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
996 $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
997 $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
998
999 foreach my $df (@deleted_files) {
1000 next if ($df =~ m/^$gsdl_tmp_area/);
1001 next if ($df =~ m/^$collect_tmp_area/);
1002
1003 push(@filtered_deleted_files,$df);
1004 }
1005
1006
1007 @deleted_files = @filtered_deleted_files;
1008
1009 if (scalar(@deleted_files)>0) {
1010 print STDERR "Files deleted since last import:\n ";
1011 print STDERR join("\n ",@deleted_files), "\n";
1012
1013
1014 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
1015
1016 mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
1017 }
1018
1019 my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
1020
1021 if (scalar(@reindex_files)>0) {
1022 print STDERR "Files to reindex since last import:\n ";
1023 print STDERR join("\n ",@reindex_files), "\n";
1024 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
1025 mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
1026 }
1027
1028 }
1029 } # end if incremental/only_add mode
1030 # else no manifest AND not incremental
1031 } # end if else block of manifest ne "" else eq ""
1032
1033 # Check for existence of the file that's to contain earliestDateStamp in archivesdir
1034 # Do nothing if the file already exists (file exists on incremental build).
1035 # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
1036 # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
1037 # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
1038 # oailastmodified and oailastmodifieddate
1039 my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp");
1040 if ($self->{'generate_auxiliary_files'}) {
1041 if (!-f $earliestDatestampFile && -d $archivedir) {
1042 my $current_time_in_seconds = time; # in seconds
1043
1044 if(open(FOUT, ">$earliestDatestampFile")) {
1045 # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
1046 print FOUT $current_time_in_seconds;
1047 close(FOUT);
1048 }
1049 else {
1050 &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
1051 }
1052
1053 }
1054 }
1055
1056 $self->perform_process_files($manifest, $pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs);
1057
1058 if ($saveas eq "FedoraMETS") {
1059 # create collection "doc obj" for Fedora that contains
1060 # collection-level metadata
1061
1062 my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
1063 $doc_obj->set_OID("collection");
1064
1065 my $col_name = undef;
1066 my $col_meta = $collectcfg->{'collectionmeta'};
1067
1068 if (defined $col_meta) {
1069 store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
1070 store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
1071 }
1072 $processor->process($doc_obj);
1073 }
1074
1075 &plugin::end($pluginfo, $processor);
1076
1077 &plugin::deinit($pluginfo, $processor);
1078
1079 # Store the value of OIDCount (used in doc.pm) so it can be
1080 # restored correctly to this value on an incremental build
1081 # - this OIDcount file should only be generated for numerical oids [jmt12]
1082 if ($self->{'OIDtype'} eq 'incremental')
1083 {
1084 store_doc_oid_count($archivedir);
1085 }
1086
1087 # signal to the processor (plugout) that we have finished processing - if we are group processing, then the final output file needs closing.
1088 $processor->close_group_output() if $processor->is_group();
1089 $processor->end();
1090
1091 if ($self->{'generate_auxiliary_files'}) {
1092
1093 # write out the archive information file
1094 # for backwards compatability with archvies.inf file
1095 if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
1096 # In the days of this being a text file, this all we had to do
1097 # Note, if still using this form of archive-inf, then neither
1098 # incremental building nor files-level document-version history
1099 # is suported
1100 $archive_info->save_info($arcinfo_doc_filename);
1101 }
1102 else {
1103 $archive_info->save_revinfo_db($arcinfo_src_filename);
1104 }
1105 }
1106
1107
1108 #
1109 # Now deal with any file-level document-version history (fldv-history)
1110 #
1111
1112 if ($keepold || $removeold) {
1113
1114 &DocHistoryFileUtils::archivedir_keepold_to_archivedir($collectcfg, $keepold, $replaceold, $incremental_mode, $archive_info, $archivedir,$archivedir_keepold);
1115
1116 }
1117
1118
1119 return $pluginfo;
1120}
1121
# @function perform_process_files()
# while process_files() above prepares the system to import files this is the
# function that actually initiates the plugin pipeline to process the files.
# This function should therefore be overridden in subclasses of inexport.pm should
# they wish to do different or further processing
# @author jmt12
sub perform_process_files
{
    my $self = shift(@_);
    my ($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs) = @_;

    my $gli = $self->{'gli'};

    if ($file_to_import ne '') {
        # A specific file was named (manifest version 2+): feed just that one
        # file through the plugin pipeline.
        &plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }
    elsif ($manifest eq '' || $self->{'manifest_version'} == 1) {
        # Global file scan.  With a new-version manifest the individual files
        # were already read above; version-1 manifests steer the import via
        # extra settings in $block_hash, while non-manifest imports use a
        # regular $block_hash (so obeying process_exp and block_exp). [jmt12]
        &plugin::read ($pluginfo, $importdir, '', $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }
    else {
        # New-version manifest present: nothing left to do here.
        print STDERR "Skipping perform_process_files() due to manifest presence and version\n";
    }
}
# perform_process_files()
1160
# @function generate_statistics()
# Print the end-of-run banner for this import/export mode and ask the plugin
# layer to write out its per-plugin processing statistics (including the fail
# log) for the run.
sub generate_statistics
{
    my $self = shift(@_);
    my ($pluginfo) = @_;

    # Pull out the run settings recorded on $self at construction time.
    my ($inexport_mode, $out, $faillogname, $statsfile, $gli)
        = @{$self}{'mode', 'out', 'faillogname', 'statsfile', 'gli'};

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "{$inexport_mode.complete}\n");
    &gsprintf($out, "*********************************************\n");

    &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
}
# generate_statistics()
1181
1182
# @function deinit()
# Close down any file handles that we opened (and hence are responsible for
# closing
sub deinit
{
    my $self = shift(@_);
    # OUT, FAILLOG and STATSFILE are global bareword handles opened elsewhere
    # in this module; the 'close_*' flags on $self record whether this object
    # (rather than the caller) opened them and so must close them here.
    close OUT if $self->{'close_out'};
    close FAILLOG if $self->{'close_faillog'};
    close STATSFILE if $self->{'close_statsfile'};
}
# deinit()
1194
1195
# Copy one collection-level metadata field (e.g. "collectionname") out of the
# parsed collect.cfg 'collectionmeta' structure and attach it to the top
# section of the given document object as "ex.<field>" metadata, preserving
# any per-language qualifier.  English (or unqualified) collection names are
# additionally mirrored into dc.Title.
sub store_collectionmeta
{
    my ($collectionmeta, $field, $doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();

    # Maps language-qualified keys (e.g. "[l=en]") to the metadata value.
    my $values_by_lang = $collectionmeta->{$field};

    foreach my $lang_key (keys %$values_by_lang)
    {
        my $value = $values_by_lang->{$lang_key};

        my $md_label = "ex.$field";

        # A key of the form "[l=xx]" carries a language qualifier, which
        # becomes a "^xx" suffix on the metadata label.
        if ($lang_key =~ m/^\[l=(.*?)\]$/)
        {
            $md_label .= "^$1";
        }

        $doc_obj->add_utf8_metadata($top_section, $md_label, $value);

        # see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
        # while "collectionname" in GS2 is called "name" in GS3.
        # Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
        if (($md_label eq "ex.collectionname^en") || ($md_label eq "ex.collectionname"))
        {
            $doc_obj->add_utf8_metadata($top_section, "dc.Title", $value);
        }
    }
}
1233
1234
# Return the full path of the "OIDcount" file kept in the given archives
# directory.  This file persists doc.pm's OID counter between builds (see
# prime_doc_oid_count() and store_doc_oid_count() below).
sub oid_count_file {
    my ($archivedir) = @_;
    return &FileUtils::filenameConcatenate($archivedir, "OIDcount");
}
1239
1240
# Restore doc.pm's OID counter ($doc::OIDcount) from the "OIDcount" file in
# the archives directory, if a previous build saved one, so that an
# incremental build continues numbering documents where the last build
# stopped.  Leaves the counter untouched when no OIDcount file exists;
# reports (but does not die) if the file exists yet cannot be read.
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # Nothing saved by a previous build => nothing to prime.
    return unless -e $oid_count_filename;

    # Three-arg open with a lexical filehandle: safer than the old two-arg
    # open on an interpolated filename (no mode-injection risk), and the
    # handle cannot clash with any global bareword handle.
    if (open(my $oid_in, '<', $oid_count_filename)) {
        my $OIDcount = <$oid_in>;
        chomp $OIDcount;
        close($oid_in);

        $doc::OIDcount = $OIDcount;
    }
    else {
        &gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
    }
}
1260
# Use the file "OIDcount" in the archives directory to record what value
# doc.pm's OID counter ($doc::OIDcount) got up to, so the next incremental
# build can resume numbering from there (see prime_doc_oid_count() above).
# Reports (but does not die) if the file cannot be written.
sub store_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # Three-arg open with a lexical filehandle replaces the old two-arg
    # open(OIDOUT,">$file") form (bareword handle, mode-injection risk).
    if (open(my $oid_out, '>', $oid_count_filename)) {
        print {$oid_out} $doc::OIDcount, "\n";
        close($oid_out);
    }
    else {
        &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
1279
1280
1281
# Compare the files found in the current import area (collected in
# $block_hash->{'all_files'}) against the file list recorded by the previous
# import ($archive_info->{'prev_import_filelist'}), and partition them into
# the $block_hash buckets the incremental build acts on:
#   'new_files'                        - not seen before
#   'existing_files'                   - seen before, unchanged
#   'reindex_files'                    - seen before but needs reprocessing
#   'new_or_modified_metadata_files'   - metadata.xml-style files added/changed
#   'deleted_files'                    - in the previous import but now gone
# Modification is judged against the timestamp of the archiveinf-doc database.
sub new_vs_old_import_diff
{
    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    # -M is "days since modification" relative to script start, so a SMALLER
    # value means MORE recently modified.
    my $archiveinf_timestamp = -M $arcinfo_doc_filename;

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default
    my $prev_all_files = $archive_info->{'prev_import_filelist'};

    my $full_prev_all_files = {};

    foreach my $prev_file (keys %$prev_all_files) {
        # arcinfo deals in real filenames ie windows short names. but the block hash stuff is all full long versions.
        $prev_file = &util::upgrade_if_dos_filename($prev_file);

        if (!&FileUtils::isFilenameAbsolute($prev_file)) {
            my $full_prev_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$prev_file);
            $full_prev_all_files->{$full_prev_file} = $prev_file;
        }
        else {
            $full_prev_all_files->{$prev_file} = $prev_file;
        }
    }

    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

        my $full_curr_file = $curr_file;

        # entry in 'all_files' is moved to either 'existing_files',
        # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

        if (!&FileUtils::isFilenameAbsolute($curr_file)) {
            # add in import dir to make absolute
            $full_curr_file = &FileUtils::filenameConcatenate($importdir,$curr_file);
        }

        # figure out if new file or not
        if (defined $full_prev_all_files->{$full_curr_file}) {
            # delete it so that only files that need deleting are left
            delete $full_prev_all_files->{$full_curr_file};

            # had it before. is it a metadata file?
            if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
                # is it modified??
                if (-M $full_curr_file < $archiveinf_timestamp) {
                    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
                    # its newer than last build
                    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
                }
            }
            else {
                if ($incremental_mode eq "all") {
                    # had it before
                    $block_hash->{'existing_files'}->{$full_curr_file} = 1;
                }
                else {
                    # Warning in "onlyadd" mode, but had it before!
                    print STDERR "Warning: File $full_curr_file previously imported.\n";
                    print STDERR "         Treating as new file\n";

                    $block_hash->{'new_files'}->{$full_curr_file} = 1;
                }
            }
        }
        else {
            if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
                # the new file is the special sort of file greenstone uses
                # to attach metadata to src documents
                # i.e metadata.xml
                # (but note, the filename used is not constrained in
                # Greenstone to always be this)

                print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
                $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
            }
            else {
                $block_hash->{'new_files'}->{$full_curr_file} = 1;
            }
        }

        delete $block_hash->{'all_files'}->{$curr_file};
    }


    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
        my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

        $situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
        $situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression

        # Go through existing_files, and mark anything that is contained
        # within 'situated_dir' to be reindexed (in case some of the metadata
        # attaches to one of these files)

        my $reindex_files = [];

        foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

            if ($existing_f =~ m/^$situated_dir/) {
                push(@$reindex_files,$existing_f);
                $block_hash->{'reindex_files'}->{$existing_f} = 1;
                delete $block_hash->{'existing_files'}->{$existing_f};
            }
        }

        # metadata file needs to be in new_files list so parsed by MetadataXMLPlug
        # (or equivalent)
        $block_hash->{'new_files'}->{$new_mdf} = 1;
    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
        if (-M $existing_filename < $archiveinf_timestamp) {
            # file is newer than last build
            push(@$reindex_files,$existing_filename);
            $block_hash->{'reindex_files'}->{$existing_filename} = 1;
        }
    }


    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # supress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    # (This was previously a void-context map with side effects; a foreach
    # loop is the idiomatic form.  $del_file is copied into $curr_file before
    # modification so the source hash values are never mutated via aliasing.)
    foreach my $del_file (values %$full_prev_all_files) {
        my $curr_file = $del_file;
        my $full_curr_file = $curr_file;

        if (!&FileUtils::isFilenameAbsolute($curr_file)) {
            # add in collect dir to make absolute
            $full_curr_file = &FileUtils::filenameConcatenate($collectdir,$curr_file);
        }

        if (!-e $full_curr_file) {
            $curr_file = &util::upgrade_if_dos_filename($curr_file);
            $block_hash->{'deleted_files'}->{$curr_file} = 1;
        }
    }
}
1483
1484
# this is used to delete "deleted" docs and to remove old versions of "changed" docs
# $mode is 'delete' or 'reindex'
#
# For every file in @$deleted_files, every OID that file contributed to is
# marked with index-status "D" in the archiveinf-doc database (so buildcol.pl
# will delete or reimport it), and the file's reverse-lookup entries are
# removed from the arcinfo object.  Non-primary files (associated/metadata
# files) instead cause their source document to be queued for reindexing in
# $block_hash->{'reindex_files'}.
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
        $mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
        # use 'archiveinf-src' info database file to look up all the OIDs
        # that this file is used in (note in most cases, it's just one OID)

        # the reverse-info db stores real (windows short) filenames, so
        # downgrade before the lookup
        my $downgraded_file = &util::downgrade_if_dos_filename($file);
        my $oids = $archive_info->get_reverseinfo($downgraded_file);
        $archive_info->remove_reverseinfo($downgraded_file);

        foreach my $oid (@$oids) {
            # get the record for this OID from doc db
            my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
            # find the source doc (the primary file that becomes this oid)
            my $doc_source_file = $doc_rec->{'src-file'}->[0];
            $doc_source_file = &util::placeholders_to_abspath($doc_source_file, "long");

            if (!&FileUtils::isFilenameAbsolute($doc_source_file)) {
                $doc_source_file = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
            }

            if ($doc_source_file ne $file) {
                # its an associated or metadata file
                # mark source doc for reimport as one of its assoc files has changed or deleted
                #$doc_source_file = &util::upgrade_if_dos_filename($doc_source_file);
                $block_hash->{'reindex_files'}->{$doc_source_file} = 1;

            } else {

                # the file to be deleted/reindexed is a primary file. We need to remove all references to this in the src db
                my $assoc_files = $doc_rec->{'assoc-file'};
                foreach my $assocfile (@$assoc_files) {
                    $assocfile = &util::placeholders_to_abspath($assocfile);
                    $archive_info->remove_reverseinfo($assocfile, $oid);
                    if (!defined $archive_info->get_reverseinfo($assocfile)) {
                        # nothing refers to it anymore, mark for reindex.
                        # block hash needs full filenames
                        $assocfile = &util::upgrade_if_dos_filename($assocfile);
                        $block_hash->{'reindex_files'}->{$assocfile} = 1;
                    }
                }

            }
            my $curr_status = $archive_info->get_status_info($oid);
            if (defined($curr_status) && (($curr_status ne "D"))) {
                if ($verbosity>1) {
                    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
                }
                # mark oid for deletion (it will be deleted or reimported)
                $archive_info->set_status_info($oid,"D");
                # rewrite the raw db entry with its index-status flipped to D,
                # appending so the newest record wins
                my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
                $val =~ s/^<index-status>(.*)$/<index-status>D/m;

                my $val_rec = &dbutil::convert_infodb_string_to_hash($infodbtype,$val);
                my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

                &dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
                &dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
            }
        }

    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.

    if ($mode eq "delete") {
        foreach my $file (@$deleted_files) {
            if (defined $block_hash->{'reindex_files'}->{$file}) {
                delete $block_hash->{'reindex_files'}->{$file};
            }
        }
    }


}
1580
# @function add_dir_contents_to_list()
# Recursively append every plain file found under $dirname to the array
# referenced by $list, skipping '.', '..' and '.svn' entries.  Returns -1
# (after printing a warning) if a directory cannot be read; the return value
# is otherwise not meaningful.
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # find all the files in the directory, using a lexical directory handle
    # (the old global bareword DIR handle could be clobbered by other code)
    my $dir_handle;
    if (!opendir($dir_handle, $dirname)) {
        print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
        return -1; # error in processing
    }
    my @dir_entries = readdir($dir_handle);
    closedir($dir_handle);

    foreach my $subfile (@dir_entries) {
        next if ($subfile =~ m/^\.\.?$/);  # skip '.' and '..'
        next if ($subfile =~ /^\.svn$/);   # skip subversion book-keeping dirs

        my $full_file = &FileUtils::filenameConcatenate($dirname, $subfile);
        if (-d $full_file) {
            # recurse into subdirectory
            &add_dir_contents_to_list($full_file, $list);
        } else {
            push (@$list, $full_file);
        }
    }

}
1609
1610
16111;
Note: See TracBrowser for help on using the repository browser.