source: main/trunk/greenstone2/perllib/inexport.pm@ 27220

Last change on this file since 27220 was 26567, checked in by ak19, 11 years ago

When a GS2 collection contains both collect.cfg and collectionConfig.xml (as advanced beatles does) the old code used to end up reading in the GS3 collectionConfig.xml instead of the GS2 collect.cfg and set the GS_mode to GS3. Now colcfg::get_collect_cfg_name takes the gs_mode (instead of determining this and returning it) and works out the collectcfg file name for the gs_mode. That means that the calling functions now need to work out the gs_mode. They do so by setting the gs_mode to gs3 if the site flag is present in the commandline, if not then it defaults to gs2. So from now on, the site flag must be specified for GS3 collections.

  • Property svn:executable set to *
File size: 38.7 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use plugin;
38use plugout;
39use manifest;
40use inexport;
41use util;
42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
47
48use File::Basename;
49
# Constructor for command-line use (import.pl / export.pl).
# $mode is "import" or "export"; $argv is a ref to the remaining
# command-line arguments; $options describes the recognised arguments
# for parse2::parse; $opt_listall_options is the argument set printed
# when -listall is given.
# Handles -xml / -listall / -h by printing usage and either returning
# early (-xml) or dying; otherwise expects exactly one leftover
# argument, the collection name.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    # -listall: print every available option (text or XML) and stop
    if ($self->{'listall'}) {
        if ($self->{'xml'}) {
            &PrintUsage::print_xml_usage($opt_listall_options);
        }
        else
        {
            &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
        }
        die "\n";
    }

    # -xml: emit usage as XML and return early — no collection name needed
    if ($self->{'xml'}) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also

    if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Redirect output to a file when -out names something other than
    # STDERR/STDOUT.  NOTE: this uses the package-level bareword
    # filehandle OUT, referred to later by its string name.
    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        open (OUT, ">$out") ||
            (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
        $out = 'inexport::OUT';
        $self->{'close_out'} = 1;
    }
    # autoflush so progress output is not buffered (works because the
    # handle name string is treated as a FileHandle under no strict 'refs')
    $out->autoflush(1);
    $self->{'out'} = $out;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # parallel-import support is only loaded when -jobs > 1 is requested
    if ((defined $self->{'jobs'}) && ($self->{'jobs'}>1)) {
        require ParallelInexport;
    }

    return bless $self, $class;
}
127
# Simplified version of the constructor, for use with CGI scripts.
# Skips all command-line parsing; output always goes to STDERR.
# When a gsdl_cgi helper object is supplied it determines the collect
# directory for the given site; otherwise the standard GS2 collect
# area under GSDLHOME is used.
sub newCGI
{
    my $class = shift (@_);
    my ($mode, $collect, $gsdl_cgi, $opt_site) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };
    $self->{'out'}        = STDERR;
    $self->{'faillog'}    = "";
    $self->{'collection'} = $collect;

    if (defined $gsdl_cgi) {
        # the CGI wrapper knows where this site keeps its collections
        $self->{'site'}       = $opt_site;
        $self->{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }
    else {
        # fall back to the default GS2 collect area
        $self->{'site'}       = "";
        $self->{'collectdir'} = &util::filename_cat($ENV{'GSDLHOME'}, "collect");
    }

    return bless $self, $class;
}
# Accessor: return the name of the collection this object operates on.
sub get_collection
{
    my ($self) = @_;
    return $self->{'collection'};
}
159
160
# Locate and load the collection's configuration file — collect.cfg for
# GS2, collectionConfig.xml for GS3 — where GS2 vs GS3 is decided solely
# by whether a non-empty -site was supplied (so -site is mandatory for
# GS3 collections).
# Returns ($config_filename, $collectcfg hashref).
# Side effects: sets $self->{'gs_version'}, opens the package FAILLOG
# filehandle (storing its name and path on $self), and pushes the
# collection's perllib dir onto @INC.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    # use_collection() validates the collection (and sets GSDLCOLLECTDIR);
    # an empty return means the collection could not be found/used
    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # set gs_version 2/3
    $self->{'gs_version'} = "2";
    if ((defined $site) && ($site ne "")) {
        # gs3
        $self->{'gs_version'} = "3";
    }
    # add collection's perllib dir into include path in
    # case we have collection specific modules
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    open (FAILLOG, ">$faillog") ||
        (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);

    # remember the real path, then switch $faillog over to the package
    # filehandle name string so later code can print through it
    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;

    # Read in the collection configuration file.
    my $gs_mode = "gs".$self->{'gs_version'}; # gs2 or gs3
    my $config_filename = &colcfg::get_collect_cfg_name($out, $gs_mode);
    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
207
# Merge settings from the collection configuration ($collectcfg) with the
# command-line options already parsed onto $self.  The precedence rule
# used throughout is: a value given on the command line wins; otherwise
# the collect-cfg value applies; otherwise a hard-coded default.
# Side effects only — resolved values are written back onto $self.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};
    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'} || "";
    my $out = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
        $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
        # we can't use the text version for archives dbs.
        $collectcfg->{'infodbtype'} = "gdbm";
    }

    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    if (!-e $importdir) {
        &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
        die "\n";
    }
    $self->{'importdir'} = $importdir;

    # default output dir depends on whether we are importing or exporting
    if ($archivedir eq "") {
        if ($inexport_mode eq "import") {
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
        }
        elsif ($inexport_mode eq "export") {
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
        }
        else {
            print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
            print STDERR " Defaulting to 'archives' for file output\n";
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
        }
    }

    # normalise separators and strip any trailing slash
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $self->{'archivedir'} = $archivedir;

    # verbosity: command line, then collect cfg, then default of 2
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    $self->{'verbosity'} = $verbosity;

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
        $self->{'manifest'} = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $self->{'gzip'} = 1;
        }
    }

    # maxdocs may legitimately be negative (-1 means "no limit")
    if ($self->{'maxdocs'} !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
        } else {
            $self->{'maxdocs'} = -1; # the default
        }
    }

    # a groupsize of 1 (the command-line default) may be overridden here
    if ((defined $self->{'groupsize'}) && ($self->{'groupsize'} == 1)) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $self->{'groupsize'} = $collectcfg->{'groupsize'};
        }
    }

    if (!defined $self->{'OIDtype'}
        || ($self->{'OIDtype'} !~ /^(hash|hash_on_full_filename|incremental|assigned|dirname|full_filename)$/ )) {
        # OIDtype was either not defined on the command-line, or if it was not one of the recognized values
        if (defined $collectcfg->{'OIDtype'}
            && $collectcfg->{'OIDtype'} =~ /^(hash|hash_on_full_filename|incremental|assigned|dirname|full_filename)$/) {
            $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
        } else {
            $self->{'OIDtype'} = "hash"; # the default
        }
    }

    if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
        if (defined $collectcfg->{'OIDmetadata'}) {
            $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
        } else {
            $self->{'OIDmetadata'} = "dc.Identifier"; # the default
        }
    }

    my $sortmeta = $self->{'sortmeta'};
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $self->{'groupsize'} > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }
    $self->{'sortmeta'} = $sortmeta;

    if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") {
        $self->{'removeprefix'} = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") {
        $self->{'removesuffix'} = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
                                                   $self->{'incremental'}, $checkdir,
                                                   $collectcfg);

    $self->{'removeold'}        = $removeold;
    $self->{'keepold'}          = $keepold;
    $self->{'incremental'}      = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;
}
358
# The main processing pipeline for import.pl / export.pl:
#   1. load the plugins named in the collect cfg;
#   2. (optionally) wipe and then (re)create the archives/export dir
#      and its archive-info databases;
#   3. if a manifest file is in use, honour its explicit delete /
#      reindex / index lists; otherwise, for incremental builds, diff
#      the import folder against the previous import;
#   4. choose and configure a plugout (output document writer);
#   5. run every remaining import file through the plugin chain.
# Returns $pluginfo so generate_statistics() can report per-plugin stats.
sub process_files
{
    my $self = shift @_;
    my ($config_filename,$collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};

    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};

    my $incremental = $self->{'incremental'};
    my $incremental_mode = $self->{'incremental_mode'};

    my $gs_version = $self->{'gs_version'};

    my $removeold = $self->{'removeold'};
    my $keepold = $self->{'keepold'};

    my $saveas = $self->{'saveas'};
    my $OIDtype = $self->{'OIDtype'};
    my $OIDmetadata = $self->{'OIDmetadata'};

    my $out = $self->{'out'};
    my $faillog = $self->{'faillog'};

    my $maxdocs = $self->{'maxdocs'};
    my $gzip = $self->{'gzip'};
    my $groupsize = $self->{'groupsize'};
    my $sortmeta = $self->{'sortmeta'};

    my $removeprefix = $self->{'removeprefix'};
    my $removesuffix = $self->{'removesuffix'};

    my $gli = $self->{'gli'};

    my $jobs = $self->{'jobs'};
    my $epoch = $self->{'epoch'};

    # related to export
    my $xsltfile = $self->{'xsltfile'};
    my $group_marc = $self->{'group_marc'};
    my $mapping_file = $self->{'mapping_file'};
    my $xslt_mets = $self->{'xslt_mets'};
    my $xslt_txt = $self->{'xslt_txt'};
    my $fedora_namespace = $self->{'fedora_namespace'};
    my $metadata_prefix = $self->{'metadata_prefix'};

    # phase markers on STDERR for GLI to parse
    if ($inexport_mode eq "import") {
        print STDERR "<Import>\n" if $gli;
    }
    else {
        print STDERR "<export>\n" if $gli;
    }

    # parse the manifest file (if one was given), resolving a relative
    # manifest path against the collection directory
    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
    if ($self->{'manifest'} ne "") {
        my $manifest_filename = $self->{'manifest'};

        if (!&util::filename_is_absolute($manifest_filename)) {
            $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
        }

        $self->{'manifest'} =~ s/[\\\/]+/\//g;
        $self->{'manifest'} =~ s/\/$//;

        $manifest_lookup->parse($manifest_filename);
    }

    my $manifest = $self->{'manifest'};

    # load all the plugins
    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    my $plugin_incr_mode = $incremental_mode;
    if ($manifest ne "") {
        # if we have a manifest file, then we pretend we are fully incremental for plugins
        $plugin_incr_mode = "all";
    }
    # some global options for the plugins
    my @global_opts = ();

    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode, $gs_version);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory (and tmp
    # directory) if needed

    if ($removeold) {
        if (-e $archivedir) {
            &gsprintf($out, "{import.removing_archives}\n");
            &util::rm_r ($archivedir);
        }
        my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
        $tmpdir =~ s/[\\\/]+/\//g;
        $tmpdir =~ s/\/$//;
        if (-e $tmpdir) {
            &gsprintf($out, "{import.removing_tmpdir}\n");
            &util::rm_r ($tmpdir);
        }
    }

    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file

    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);

    my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
    $archive_info->load_info ($arcinfo_doc_filename);

    if ($manifest eq "") {
        # Load in list of files in import folder from last import (if present)
        $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    }

    ####Use Plugout####
    # Choose the plugout: one named in the collect cfg wins; otherwise
    # pick from -saveas, falling back to the standard format for the mode.
    my $plugout;

    if ($inexport_mode eq "import") {
        if (defined $collectcfg->{'plugout'}) {
            # If a plugout was specified in the collect.cfg file, assume it is sensible
            # We can't check the name because it could be anything, if it is a custom plugout
            $plugout = $collectcfg->{'plugout'};
        }
        else{
            if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
                push @$plugout,"GreenstoneXMLPlugout";
            }
            else{
                push @$plugout,$saveas."Plugout";
            }
        }
    }
    else {
        if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
            $plugout = $collectcfg->{'plugout'};
        }
        else{
            if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
                push @$plugout,"GreenstoneMETSPlugout";
            }
            else{
                push @$plugout,$saveas."Plugout";
            }
        }
    }

    my $plugout_name = $plugout->[0];

    # append the standard plugout options...
    push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
    push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
    push @$plugout,("-debug") if ($debug);
    push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
    push @$plugout,("-gzip_output") if ($gzip);
    push @$plugout,("-output_handle",$out) if (defined $out);

    push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");

    # ...then the format-specific ones
    if ($plugout_name =~ m/^MARCXMLPlugout$/) {
        push @$plugout,("-group") if ($group_marc);
        push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "");
    }
    if ($plugout_name =~ m/^.*METSPlugout$/) {
        push @$plugout,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $xslt_mets ne "");
        push @$plugout,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $xslt_txt ne "");
    }

    if ($plugout_name eq "FedoraMETSPlugout") {
        push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "");
    }

    if ($plugout_name eq "DSpacePlugout") {
        push @$plugout,("-metadata_prefix",$metadata_prefix) if (defined $metadata_prefix && $metadata_prefix ne "");
    }

    my $processor = &plugout::load_plugout($plugout);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;

    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($removeold) {
        # occasionally, plugins may want to do something on remove
        # old, eg pharos image indexing
        &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
    }

    # process the import directory
    my $block_hash = {};
    $block_hash->{'new_files'} = {};
    $block_hash->{'reindex_files'} = {};
    my $metadata = {};

    # global blocking pass may set up some metadata
    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);

    if ($manifest ne "") {
        # A manifest explicitly lists what to delete, reindex and index,
        # so no import-folder diffing is needed.
        #
        # 1. Process delete files first
        #
        my @deleted_files = keys %{$manifest_lookup->{'delete'}};
        my @full_deleted_files = ();

        # ensure all filenames are absolute
        foreach my $df (@deleted_files) {
            my $full_df =
                (&util::filename_is_absolute($df))
                ? $df
                : &util::filename_cat($importdir,$df);

            if (-d $full_df) {
                &add_dir_contents_to_list($full_df, \@full_deleted_files);
            } else {
                push(@full_deleted_files,$full_df);
            }
        }

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
        mark_docs_for_deletion($archive_info,{},
                               \@full_deleted_files,
                               $archivedir, $verbosity, "delete");


        #
        # 2. Now files for reindexing
        #

        my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
        my @full_reindex_files = ();
        # ensure all filenames are absolute
        foreach my $rf (@reindex_files) {
            my $full_rf =
                (&util::filename_is_absolute($rf))
                ? $rf
                : &util::filename_cat($importdir,$rf);

            if (-d $full_rf) {
                &add_dir_contents_to_list($full_rf, \@full_reindex_files);
            } else {
                push(@full_reindex_files,$full_rf);
            }
        }

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
        mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");

        # And now to ensure the new version of the file processed by
        # appropriate plugin, we need to add it to block_hash reindex list
        foreach my $full_rf (@full_reindex_files) {
            $block_hash->{'reindex_files'}->{$full_rf} = 1;
        }


        #
        # 3. Now finally any new files - add to block_hash new_files list
        #

        my @new_files = keys %{$manifest_lookup->{'index'}};
        my @full_new_files = ();

        foreach my $nf (@new_files) {
            # ensure filename is absolute
            my $full_nf =
                (&util::filename_is_absolute($nf))
                ? $nf
                : &util::filename_cat($importdir,$nf);

            if (-d $full_nf) {
                &add_dir_contents_to_list($full_nf, \@full_new_files);
            } else {
                push(@full_new_files,$full_nf);
            }
        }

        # skip any "new" file already recorded in the src archive db
        my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
        my $arcinfodb_map = {};
        &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
        foreach my $f (@full_new_files) {
            # check that we haven't seen it already
            if (defined $arcinfodb_map->{$f}) {
                # TODO make better warning
                print STDERR "Warning: $f already in src archive, \n";
            } else {
                $block_hash->{'new_files'}->{$f} = 1;
            }
        }

        undef $arcinfodb_map;
    }
    else {
        # if incremental, we read through the import folder to see whats changed.

        if ($incremental || $incremental_mode eq "onlyadd") {
            prime_doc_oid_count($archivedir);

            # Can now work out which files were new, already existed, and have
            # been deleted

            new_vs_old_import_diff($archive_info,$block_hash,$importdir,
                                   $archivedir,$verbosity,$incremental_mode);

            my @new_files = sort keys %{$block_hash->{'new_files'}};
            if (scalar(@new_files>0)) {
                print STDERR "New files and modified metadata files since last import:\n ";
                print STDERR join("\n ",@new_files), "\n";
            }

            if ($incremental) {
                # only look for deletions if we are truely incremental
                my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
                # Filter out any in gsdl/tmp area
                my @filtered_deleted_files = ();
                my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
                my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
                $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
                $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);

                foreach my $df (@deleted_files) {
                    next if ($df =~ m/^$gsdl_tmp_area/);
                    next if ($df =~ m/^$collect_tmp_area/);

                    push(@filtered_deleted_files,$df);
                }


                @deleted_files = @filtered_deleted_files;

                if (scalar(@deleted_files)>0) {
                    print STDERR "Files deleted since last import:\n ";
                    print STDERR join("\n ",@deleted_files), "\n";


                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);

                    mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
                }

                my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};

                if (scalar(@reindex_files)>0) {
                    print STDERR "Files to reindex since last import:\n ";
                    print STDERR join("\n ",@reindex_files), "\n";
                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
                    mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
                }

            }
        }
    }

    # Check for existence of the file that's to contain earliestDateStamp in archivesdir
    # Do nothing if the file already exists (file exists on incremental build).
    # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
    # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
    # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
    # oailastmodified and oailastmodifieddate
    my $earliestDatestampFile = &util::filename_cat($archivedir, "earliestDatestamp");
    if (!-f $earliestDatestampFile && -d $archivedir) {
        my $current_time_in_seconds = time; # in seconds

        if(open(FOUT, ">$earliestDatestampFile")) {
            # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
            print FOUT $current_time_in_seconds;
            close(FOUT);
        }
        else {
            &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
        }

    }

    # now, whichever mode we are in, we can process the entire import folder
    if ((defined $jobs) && ($jobs > 1))
    {
        # if jobs are set to >1, run in parallel using MPI helper
        # [hs, 1 july 2010]
        &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,
                                              $self->{'collection'}, $self->{'site'});
    }
    else
    {
        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }


    if ($saveas eq "FedoraMETS") {
        # create collection "doc obj" for Fedora that contains
        # collection-level metadata

        my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
        $doc_obj->set_OID("collection");

        my $col_name = undef;
        my $col_meta = $collectcfg->{'collectionmeta'};

        if (defined $col_meta) {
            store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
            store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
        }
        $processor->process($doc_obj);
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # Store the value of OIDCount (used in doc.pm) so it can be
    # restored correctly to this value on an incremental build
    store_doc_oid_count($archivedir);

    # write out the archive information file
    $processor->close_file_output() if (defined $groupsize) && ($groupsize > 1);
    $processor->close_group_output() if $processor->is_group();

    # for backwards compatibility with the archives.inf file
    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
        $archive_info->save_info($arcinfo_doc_filename);
    }
    else {
        $archive_info->save_revinfo_db($arcinfo_src_filename);
    }

    return $pluginfo;
}
800
801
# Print the closing summary banner and per-plugin statistics for an
# import/export run, then close the output and faillog filehandles that
# new() and read_collection_cfg() opened.  Statistics go to -statsfile
# (opened here when it isn't STDERR/STDOUT), falling back to STDERR if
# that file cannot be opened.
sub generate_statistics
{
    my $self = shift @_;
    my ($pluginfo) = @_;

    my $inexport_mode = $self->{'mode'};

    my $statsfile = $self->{'statsfile'};
    my $out = $self->{'out'};
    my $faillogname = $self->{'faillogname'};
    my $gli = $self->{'gli'};
    my $jobs = $self->{'jobs'};

    # write out import stats

    if ((!defined $jobs) || ($jobs == 1))
    {
        # NOTE(review): despite the original comment claiming stats are
        # "only output ... if there are multiple jobs", this condition
        # produces stats only for a single-job (non-parallel) run
        # [hs, 1 july 2010]

        my $close_stats = 0;
        if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
            if (open (STATS, ">$statsfile")) {
                # swap the scalar over to the package filehandle name
                $statsfile = 'inexport::STATS';
                $close_stats = 1;
            } else {
                &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
                &gsprintf($out, "{import.stats_backup}\n");
                $statsfile = 'STDERR';
            }
        }

        &gsprintf($out, "\n");
        &gsprintf($out, "*********************************************\n");
        &gsprintf($out, "{$inexport_mode.complete}\n");
        &gsprintf($out, "*********************************************\n");

        &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
        if ($close_stats) {
            close STATS;
        }
    }

    # OUT was opened by new() only when -out named a file
    close OUT if $self->{'close_out'};
    close FAILLOG;
}
848
849
# Copy one collection-level metadata field ($field, e.g. "collectionname")
# from the collect-cfg metadata hash onto the top section of $doc_obj as
# "ex.<field>" metadata.  Keys of the form [l=xx] carry a language code,
# which becomes a ^xx suffix on the metadata label.  The collection name
# (plain, or its English variant) is additionally stored as dc.Title.
sub store_collectionmeta
{
    my ($collectionmeta,$field,$doc_obj) = @_;

    my $top_section    = $doc_obj->get_top_section();
    my $values_by_lang = $collectionmeta->{$field};

    foreach my $lang_key (keys %$values_by_lang)
    {
        my $value = $values_by_lang->{$lang_key};

        # base label, plus a ^lang suffix when the key looks like [l=xx]
        my $label = "ex.$field";
        $label .= "^$1" if $lang_key =~ m/^\[l=(.*?)\]$/;

        $doc_obj->add_utf8_metadata($top_section, $label, $value);

        # see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
        # while "collectionname" in GS2 is called "name" in GS3.
        # Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
        if ($label eq "ex.collectionname^en" || $label eq "ex.collectionname")
        {
            $doc_obj->add_utf8_metadata($top_section, "dc.Title", $value);
        }
    }
}
887
888
# Return the full path of the "OIDcount" file kept in the archives
# directory (the persisted value of doc.pm's OID counter).
sub oid_count_file {
    my ($archive_dir) = @_;
    return &util::filename_cat($archive_dir, "OIDcount");
}
893
894
# Restore doc.pm's package-global OID counter ($doc::OIDcount) from the
# "OIDcount" file in the archives directory, so an incremental build
# continues numbering where the previous build left off.
# Does nothing when the file does not exist (e.g. on a first/full build).
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # nothing to restore on a first/full build
    return unless -e $oid_count_filename;

    # 3-arg open with a lexical filehandle (avoids the 2-arg/bareword
    # form of the original, which is unsafe and leaks a global handle)
    if (open(my $oid_in, '<', $oid_count_filename)) {
        my $OIDcount = <$oid_in>;
        close($oid_in);

        # guard against an empty file so we never clobber the counter
        # with undef (the original chomp'd and assigned unconditionally)
        if (defined $OIDcount) {
            chomp $OIDcount;
            $doc::OIDcount = $OIDcount;
        }
    }
    else {
        &gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
    }
}
914
sub store_doc_oid_count
{
    # Use the file "OIDcount" in the archives directory to record
    # what value doc.pm got up to, so an incremental build can restore
    # the counter via prime_doc_oid_count().

    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # 3-arg open with a lexical filehandle (the original used unsafe
    # 2-arg open on a bareword global handle)
    if (open(my $oid_out, '>', $oid_count_filename)) {
        print $oid_out $doc::OIDcount, "\n";

        # buffered write errors only surface at close, so check it
        close($oid_out)
            || &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
    else {
        &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
933
934
935
sub new_vs_old_import_diff
{
    # Compare the files found by the current import run against the file
    # list recorded by the previous import, and classify every file for
    # incremental processing.
    #
    #   $archive_info     - arcinfo object for the collection; supplies
    #                       'infodbtype' and 'prev_import_filelist'
    #   $block_hash       - in/out hash of file lists; entries in
    #                       'all_files' are consumed and moved into
    #                       'existing_files', 'new_files', 'deleted_files',
    #                       'reindex_files' or 'new_or_modified_metadata_files'
    #   $importdir        - import directory, used to absolutise relative paths
    #   $archivedir       - archives directory (location of archiveinf-doc)
    #   $verbosity        - >= 2 enables per-file diagnostic output
    #   $incremental_mode - "all" keeps previously seen files as existing;
    #                       any other value (e.g. "onlyadd") re-treats them as new
    #
    # "Modified" is judged by comparing -M (age in days since script start,
    # smaller = more recently touched) of a file against the age of the
    # archiveinf-doc database, i.e. the time of the last build.

    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    # Age (in days) of the archiveinf-doc db: anything with a smaller -M
    # value has been modified since the last import/build.
    my $archiveinf_timestamp = -M $arcinfo_doc_filename;

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default

    my $prev_all_files = $archive_info->{'prev_import_filelist'};
    # Maps absolute path -> path as recorded in the previous file list.
    my $full_prev_all_files = {};

    foreach my $prev_file (keys %$prev_all_files) {

	if (!&util::filename_is_absolute($prev_file)) {
	    my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
	    $full_prev_all_files->{$full_prev_file} = $prev_file;
	}
	else {
	    $full_prev_all_files->{$prev_file} = $prev_file;
	}
    }


    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

	my $full_curr_file = $curr_file;

	# entry in 'all_files' is moved to either 'existing_files',
	# 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

	if (!&util::filename_is_absolute($curr_file)) {
	    # add in import dir to make absolute
	    $full_curr_file = &util::filename_cat($importdir,$curr_file);
	}

	# figure out if new file or not
	if (defined $full_prev_all_files->{$full_curr_file}) {
	    # delete it so that only files that need deleting are left
	    delete $full_prev_all_files->{$full_curr_file};

	    # had it before. is it a metadata file?
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {

		# is it modified??
		# (-M smaller than the db's -M means touched after last build)
		if (-M $full_curr_file < $archiveinf_timestamp) {
		    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
		    # its newer than last build
		    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
		}
		# NOTE(review): an unmodified metadata file falls through
		# without being classified at all - presumably intentional,
		# since its effects are already in the archives.
	    }
	    else {
		if ($incremental_mode eq "all") {

		    # had it before
		    $block_hash->{'existing_files'}->{$full_curr_file} = 1;

		}
		else {
		    # Warning in "onlyadd" mode, but had it before!
		    print STDERR "Warning: File $full_curr_file previously imported.\n";
		    print STDERR " Treating as new file\n";

		    $block_hash->{'new_files'}->{$full_curr_file} = 1;

		}
	    }
	}
	else {
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# the new file is the special sort of file greenstone uses
		# to attach metadata to src documents
		# i.e metadata.xml
		# (but note, the filename used is not constrained in
		# Greenstone to always be this)

		print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
		$block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
	    }
	    else {
		$block_hash->{'new_files'}->{$full_curr_file} = 1;
	    }
	}


	delete $block_hash->{'all_files'}->{$curr_file};
    }




    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
	# Split off the directory the metadata file sits in; everything
	# under that directory is a candidate for reindexing.
	my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

	$situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
	$situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression

	# Go through existing_files, and mark anything that is contained
	# within 'situated_dir' to be reindexed (in case some of the metadata
	# attaches to one of these files)

	my $reindex_files = [];

	foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

	    # Prefix match: file path starts with the metadata file's dir.
	    if ($existing_f =~ m/^$situated_dir/) {

		print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

		push(@$reindex_files,$existing_f);
		$block_hash->{'reindex_files'}->{$existing_f} = 1;
		delete $block_hash->{'existing_files'}->{$existing_f};

	    }
	}

	# metadata file needs to be in new_files list so parsed by MetadataXMLPlug
	# (or equivalent)
	$block_hash->{'new_files'}->{$new_mdf} = 1;

    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    # NOTE(review): this local list is populated below but never read
    # afterwards - 'reindex_files' in $block_hash is the real output.
    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
	if (-M $existing_filename < $archiveinf_timestamp) {
	    # file is newer than last build

	    my $existing_file = $existing_filename;
	    #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});

	    #my $collectdir_resafe = &util::filename_to_regex($collectdir);
	    #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;

	    print STDERR "**** Reindexing existing file: $existing_file\n";

	    push(@$reindex_files,$existing_file);
	    $block_hash->{'reindex_files'}->{$existing_filename} = 1;
	}

    }


    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # supress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    my @deleted_files = values %$full_prev_all_files;
    # NOTE(review): map in void context used purely for side effects on
    # $block_hash - a foreach loop would be the idiomatic form.
    map { my $curr_file = $_;
	  my $full_curr_file = $curr_file;

	  if (!&util::filename_is_absolute($curr_file)) {
	      # add in import dir to make absolute

	      $full_curr_file = &util::filename_cat($collectdir,$curr_file);
	  }


	  # Only files that no longer exist anywhere on disk are marked
	  # as deleted; still-present ones (e.g. tmp/cache) are skipped.
	  if (!-e $full_curr_file) {
	      $block_hash->{'deleted_files'}->{$curr_file} = 1;
	  }
      } @deleted_files;



}
1133
1134
# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
# $mode is 'delete' or 'reindex'
sub mark_docs_for_deletion
{
    # For every file in @$deleted_files, look up the document OID(s) it
    # contributed to via the archiveinf-src database, remove the file's
    # src record, and set each affected OID's index-status to "D" in the
    # archiveinf-doc database so buildcol.pl will delete (or rebuild) it.
    #
    #   $archive_info  - arcinfo object; supplies 'infodbtype' and the
    #                    per-OID status get/set methods
    #   $block_hash    - in/out; 'reindex_files' gains source docs whose
    #                    associated/metadata files changed or vanished
    #   $deleted_files - array ref of files to process
    #   $archivedir    - archives directory holding the infodb files
    #   $verbosity     - >1 prints one line per OID marked
    #   $mode          - 'delete' or 'reindex' (affects messages and the
    #                    final reindex-list clean-up)

    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
	$mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
	# use 'archiveinf-src' info database file to look up all the OIDs
	# that this file is used in (note in most cases, it's just one OID)

	my $src_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
	my $oids = $src_rec->{'oid'};
	my $file_record_deleted = 0;

	# delete the src record
	# (open in "append" mode, remove just this entry, close again -
	# the handle is re-opened per file rather than held across the loop)
	my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
	&dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
	&dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);


	foreach my $oid (@$oids) {

	    # find the source doc (the primary file that becomes this oid)
	    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
	    my $doc_source_file = $doc_rec->{'src-file'}->[0];
	    if (!&util::filename_is_absolute($doc_source_file)) {
		$doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
	    }

	    if ($doc_source_file ne $file) {
		# its an associated or metadata file

		# mark source doc for reimport as one of its assoc files has changed or deleted
		$block_hash->{'reindex_files'}->{$doc_source_file} = 1;

	    }
	    # Only mark OIDs not already in state "D" (avoids redundant
	    # writes when several deleted files map to the same OID).
	    my $curr_status = $archive_info->get_status_info($oid);
	    if (defined($curr_status) && (($curr_status ne "D"))) {
		if ($verbosity>1) {
		    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
		}
		# mark oid for deletion (it will be deleted or reimported)
		$archive_info->set_status_info($oid,"D");
		# Rewrite the raw db record with its <index-status> line
		# flipped to D, then write the re-parsed record back.
		my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
		$val =~ s/^<index-status>(.*)$/<index-status>D/m;

		my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
		my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

		&dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
		&dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
	    }
	}

    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.

    if ($mode eq "delete") {
	foreach my $file (@$deleted_files) {
	    if (defined $block_hash->{'reindex_files'}->{$file}) {
		delete $block_hash->{'reindex_files'}->{$file};
	    }
	}
    }


}
1219
sub add_dir_contents_to_list {

    # Recursively append every plain file found under $dirname to the
    # array ref $list, skipping '.', '..' and '.svn' entries.
    #
    #   $dirname - directory to scan
    #   $list    - array ref accumulating full file paths (modified in place)
    #
    # Returns -1 (after printing a warning) if the directory cannot be
    # read; otherwise the return value is not meaningful.

    my ($dirname, $list) = @_;

    # Lexical directory handle instead of the global bareword DIR, so
    # this sub can never clash with another DIR handle elsewhere.
    if (!opendir(my $dir_handle, $dirname)) {
	print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    # Read the whole directory up front so the handle is closed before
    # any recursive calls.
    my @dir_entries = readdir($dir_handle);
    closedir($dir_handle);

    foreach my $subfile (@dir_entries) {
	next if ($subfile =~ m/^\.\.?$/);   # skip '.' and '..'
	next if ($subfile =~ /^\.svn$/);    # skip Subversion bookkeeping dirs
	my $full_file = &util::filename_cat($dirname, $subfile);
	if (-d $full_file) {
	    &add_dir_contents_to_list($full_file, $list);
	} else {
	    push (@$list, $full_file);
	}
    }

}
1248
1249
12501;
Note: See TracBrowser for help on using the repository browser.