source: main/trunk/greenstone2/perllib/inexport.pm@ 26450

Last change on this file since 26450 was 26450, checked in by ak19, 11 years ago
  1. Added GS3 colconfig.xml processing of any OIDtype and OIDmetadata options specified (under new element importOptions) in collectionConfig.xml 2. Any OIDtype and/or OIDmetadata options provided to import.pl on the commandline still override what's in the collectionConfig.xml. Note: as these are not to be written back out to the collectionConfig file, buildcol.pl (which does not take either as argument) will have no recollection of either of these options specified as cmdline arguments to import.pl, and can at best consult whatever may be in collectionConfig.xml
  • Property svn:executable set to *
File size: 39.0 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use plugin;
38use plugout;
39use manifest;
40use inexport;
41use util;
42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
47
48use File::Basename;
49
# Constructor for command-line use (import.pl / export.pl).
#
# Parameters:
#   $mode                - "import" or "export"; stored in $self->{'mode'} and
#                          consulted later (e.g. by set_collection_options)
#   $argv                - ref to the remaining command-line args; after
#                          parsing, exactly one element (the collection name)
#                          must remain
#   $options             - option spec hash; its 'args' entry is handed to
#                          parse2::parse, which fills matching $self fields
#   $opt_listall_options - option set printed when the user asked for -listall
#
# Returns a blessed inexport object.  Dies (after printing usage) on bad
# arguments; dies with "\n" so no extra message is appended to the usage text.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    # -listall: print every available option (in XML for the GLI, plain
    # text otherwise) and stop
    if ($self->{'listall'}) {
        if ($self->{'xml'}) {
            &PrintUsage::print_xml_usage($opt_listall_options);
        }
        else
        {
            &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
        }
        die "\n";
    }

    # -xml mode returns early: it prints the option spec and does NOT
    # require a collection name (used by the GLI to discover options)
    if ($self->{'xml'}) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also

    if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Redirect output to a file when -out names something other than
    # STDERR/STDOUT.  The bareword handle inexport::OUT is closed later in
    # generate_statistics(); 'close_out' records that we opened it here.
    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        open (OUT, ">$out") ||
            (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
        $out = 'inexport::OUT';
        $self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # parallel processing support is only loaded when -jobs > 1 was given
    if ((defined $self->{'jobs'}) && ($self->{'jobs'}>1)) {
        require ParallelInexport;
    }

    return bless $self, $class;
}
127
# Simplified version of the constructor for use with CGI scripts:
# no command-line parsing, and output always goes to STDERR.
#
# Parameters:
#   $mode     - "import" or "export"
#   $collect  - collection name
#   $gsdl_cgi - optional CGI helper object; when supplied, its
#               get_collection_dir() determines 'collectdir' and $opt_site
#               becomes 'site'
#   $opt_site - site name passed through when $gsdl_cgi is given
#
# When $gsdl_cgi is absent, 'site' is "" and 'collectdir' falls back to
# $GSDLHOME/collect.  Returns a blessed inexport object.
sub newCGI
{
    my $class = shift (@_);
    my ($mode,$collect,$gsdl_cgi,$opt_site) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    $self->{'out'} = STDERR;

    if (defined $gsdl_cgi) {
        $self->{'site'} = $opt_site;
        my $collect_dir = $gsdl_cgi->get_collection_dir($opt_site);
        $self->{'collectdir'} = $collect_dir;
    }
    else {
        $self->{'site'} = "";
        $self->{'collectdir'} = &util::filename_cat($ENV{'GSDLHOME'},"collect");
    }
    # empty faillog means read_collection_cfg will use the default
    # etc/fail.log location
    $self->{'faillog'} = "";

    $self->{'collection'} = $collect;

    return bless $self, $class;
}
# Accessor: the name of the collection this object was constructed for.
sub get_collection
{
    my ($self) = @_;

    return $self->{'collection'};
}
159
160
# Resolves the collection, opens the fail log, and reads the collection
# configuration (collect.cfg for GS2, collectionConfig.xml for GS3).
#
# Parameters:
#   $collection - collection name as given on the command line
#   $options    - option spec, only used to print usage on failure
#
# Returns ($config_filename, $collectcfg) where $collectcfg is the parsed
# configuration hash from colcfg::read_collection_cfg.
#
# Side effects: sets $self->{'gs_version'} ("2" or "3"), prepends the
# collection's perllib to @INC, and opens the fail log on the bareword
# handle inexport::FAILLOG (closed later in generate_statistics — do not
# convert to a lexical handle without updating that sub too).
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # set gs_version 2/3: a non-empty site implies a GS3 installation
    $self->{'gs_version'} = "2";
    if ((defined $site) && ($site ne "")) {
        # gs3
        $self->{'gs_version'} = "3";
    }
    # add collection's perllib dir into include path in
    # case we have collection specific modules
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    open (FAILLOG, ">$faillog") ||
        (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);


    # keep the path for reporting, but hand around the handle name so
    # other code can print to it
    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;

    # Read in the collection configuration file.
    my ($config_filename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
206
# Merges command-line options with the collection configuration, applying
# the precedence "command line overrides collect(ion) cfg overrides
# built-in default", then normalises and validates the results.
#
# Parameters:
#   $collectcfg - parsed configuration hash from read_collection_cfg
#
# Side effects: fills in $self->{importdir, archivedir, verbosity,
# manifest, gzip, maxdocs, groupsize, OIDtype, OIDmetadata, sortmeta,
# removeprefix, removesuffix, debug, gli, removeold, keepold, incremental,
# incremental_mode}.  Commandline OIDtype/OIDmetadata are copied into the
# in-memory $collectcfg hash but are deliberately never written back to
# the config file.  Dies if the import directory does not exist.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};
    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'} || "";
    my $out = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
        $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
        # we can't use the text version for archives dbs.
        $collectcfg->{'infodbtype'} = "gdbm";
    }

    # if OIDtype and/or OIDmetadata args are specified on the commandline, they override what's
    # in collectcfg but may not overwrite it (by writing it into the collectcfg file)
    if (defined $self->{'OIDtype'} && $self->{'OIDtype'} =~ /\w/)
    {
        $collectcfg->{'OIDtype'} = $self->{'OIDtype'}; # store in the in-memory collectcfg hash
    }
    if (defined $self->{'OIDmetadata'} && $self->{'OIDmetadata'} =~ /\w/)
    {
        $collectcfg->{'OIDmetadata'} = $self->{'OIDmetadata'}; # store in the in-memory collectcfg hash
    }

    # config file values only apply when no directory was given on the
    # command line
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    if (!-e $importdir) {
        &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
        die "\n";
    }
    $self->{'importdir'} = $importdir;

    if ($archivedir eq "") {
        # default output dir depends on whether we are importing or exporting
        if ($inexport_mode eq "import") {
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
        }
        elsif ($inexport_mode eq "export") {
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
        }
        else {
            print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
            print STDERR "         Defaulting to 'archives' for file output\n";
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
        }
    }

    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $self->{'archivedir'} = $archivedir;

    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    $self->{'verbosity'} = $verbosity;

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
        $self->{'manifest'} = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $self->{'gzip'} = 1;
        }
    }

    if ($self->{'maxdocs'} !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
        } else {
            $self->{'maxdocs'} = -1; # the default
        }
    }

    # groupsize == 1 is treated as "not set on the command line", so the
    # config value may take over
    if ((defined $self->{'groupsize'}) && ($self->{'groupsize'} == 1)) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $self->{'groupsize'} = $collectcfg->{'groupsize'};
        }
    }

    if (!defined $self->{'OIDtype'}
        || ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) {
        if (defined $collectcfg->{'OIDtype'}
            && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
            $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
        } else {
            $self->{'OIDtype'} = "hash"; # the default
        }
    }

    if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
        if (defined $collectcfg->{'OIDmetadata'}) {
            $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
        } else {
            $self->{'OIDmetadata'} = "dc.Identifier"; # the default
        }
    }

    my $sortmeta = $self->{'sortmeta'};
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $self->{'groupsize'} > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }
    $self->{'sortmeta'} = $sortmeta;

    if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") {
        $self->{'removeprefix'} = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") {
        $self->{'removesuffix'} = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
                                                   $self->{'incremental'}, $checkdir,
                                                   $collectcfg);

    $self->{'removeold'} = $removeold;
    $self->{'keepold'} = $keepold;
    $self->{'incremental'} = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;
}
367
# The main work-horse: loads the plugins, (re)creates the archives/export
# directory, works out which files are new/changed/deleted — either from a
# manifest file or by diffing against the previous import's file list —
# then feeds the import directory through the plugin pipeline into the
# chosen plugout.
#
# Parameters:
#   $config_filename - collection config path (only used to build the
#                      collection-level doc object for FedoraMETS export)
#   $collectcfg      - parsed collection configuration hash
#
# Returns $pluginfo (the loaded plugin list) so generate_statistics()
# can write per-plugin stats afterwards.
sub process_files
{
    my $self = shift @_;
    my ($config_filename,$collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};

    my $importdir  = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};

    my $incremental = $self->{'incremental'};
    my $incremental_mode = $self->{'incremental_mode'};

    my $gs_version = $self->{'gs_version'};

    my $removeold = $self->{'removeold'};
    my $keepold = $self->{'keepold'};

    my $saveas = $self->{'saveas'};
    my $OIDtype = $self->{'OIDtype'};
    my $OIDmetadata = $self->{'OIDmetadata'};

    my $out = $self->{'out'};
    my $faillog = $self->{'faillog'};

    my $maxdocs = $self->{'maxdocs'};
    my $gzip = $self->{'gzip'};
    my $groupsize = $self->{'groupsize'};
    my $sortmeta = $self->{'sortmeta'};

    my $removeprefix = $self->{'removeprefix'};
    my $removesuffix = $self->{'removesuffix'};

    my $gli = $self->{'gli'};

    my $jobs = $self->{'jobs'};
    my $epoch = $self->{'epoch'};

    # related to export
    my $xsltfile = $self->{'xsltfile'};
    my $group_marc = $self->{'group_marc'};
    my $mapping_file = $self->{'mapping_file'};
    my $xslt_mets = $self->{'xslt_mets'};
    my $xslt_txt = $self->{'xslt_txt'};
    my $fedora_namespace = $self->{'fedora_namespace'};
    my $metadata_prefix = $self->{'metadata_prefix'};

    # progress markers parsed by the GLI
    if ($inexport_mode eq "import") {
        print STDERR "<Import>\n" if $gli;
    }
    else {
        print STDERR "<export>\n" if $gli;
    }

    # parse the manifest (if one was given); it explicitly lists the files
    # to index/reindex/delete instead of scanning the import dir
    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
    if ($self->{'manifest'} ne "") {
        my $manifest_filename = $self->{'manifest'};

        if (!&util::filename_is_absolute($manifest_filename)) {
            $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
        }

        $self->{'manifest'} =~ s/[\\\/]+/\//g;
        $self->{'manifest'} =~ s/\/$//;

        $manifest_lookup->parse($manifest_filename);
    }

    my $manifest = $self->{'manifest'};

    # load all the plugins
    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    my $plugin_incr_mode = $incremental_mode;
    if ($manifest ne "") {
        # if we have a manifest file, then we pretend we are fully incremental for plugins
        $plugin_incr_mode = "all";
    }
    #some global options for the plugins
    my @global_opts = ();

    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode, $gs_version);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory (and tmp
    # directory) if needed

    if ($removeold) {
        if (-e $archivedir) {
            &gsprintf($out, "{import.removing_archives}\n");
            &util::rm_r ($archivedir);
        }
        my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
        $tmpdir =~ s/[\\\/]+/\//g;
        $tmpdir =~ s/\/$//;
        if (-e $tmpdir) {
            &gsprintf($out, "{import.removing_tmpdir}\n");
            &util::rm_r ($tmpdir);
        }
    }

    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file

    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);

    my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
    $archive_info->load_info ($arcinfo_doc_filename);

    if ($manifest eq "") {
        # Load in list of files in import folder from last import (if present)
        $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    }

    ####Use Plugout####
    # choose the output format: the config's plugout wins; otherwise it is
    # derived from -saveas, falling back to the mode's default plugout
    my $plugout;

    if ($inexport_mode eq "import") {
        if (defined $collectcfg->{'plugout'}) {
            # If a plugout was specified in the collect.cfg file, assume it is sensible
            # We can't check the name because it could be anything, if it is a custom plugout
            $plugout = $collectcfg->{'plugout'};
        }
        else{
            if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
                push @$plugout,"GreenstoneXMLPlugout";
            }
            else{
                push @$plugout,$saveas."Plugout";
            }
        }
    }
    else {
        if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
            $plugout = $collectcfg->{'plugout'};
        }
        else{
            if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
                push @$plugout,"GreenstoneMETSPlugout";
            }
            else{
                push @$plugout,$saveas."Plugout";
            }
        }
    }

    my $plugout_name = $plugout->[0];

    # append plugout options; plugout-specific ones only for the matching
    # plugout name
    push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
    push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
    push @$plugout,("-debug") if ($debug);
    push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
    push @$plugout,("-gzip_output") if ($gzip);
    push @$plugout,("-output_handle",$out) if (defined $out);

    push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");

    if ($plugout_name =~ m/^MARCXMLPlugout$/) {
        push @$plugout,("-group") if ($group_marc);
        push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "");
    }
    if ($plugout_name =~ m/^.*METSPlugout$/) {
        push @$plugout,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $xslt_mets ne "");
        push @$plugout,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $xslt_txt ne "");
    }

    if ($plugout_name eq "FedoraMETSPlugout") {
        push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "");
    }

    if ($plugout_name eq "DSpacePlugout") {
        push @$plugout,("-metadata_prefix",$metadata_prefix) if (defined $metadata_prefix && $metadata_prefix ne "");
    }

    my $processor = &plugout::load_plugout($plugout);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($removeold) {
        # occasionally, plugins may want to do something on remove
        # old, eg pharos image indexing
        &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
    }

    # process the import directory
    my $block_hash = {};
    $block_hash->{'new_files'} = {};
    $block_hash->{'reindex_files'} = {};
    my $metadata = {};

    # global blocking pass may set up some metadata
    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);

    if ($manifest ne "") {
        #
        # 1. Process delete files first
        #
        my @deleted_files = keys %{$manifest_lookup->{'delete'}};
        my @full_deleted_files = ();

        # ensure all filenames are absolute
        foreach my $df (@deleted_files) {
            my $full_df =
                (&util::filename_is_absolute($df))
                ? $df
                : &util::filename_cat($importdir,$df);

            if (-d $full_df) {
                &add_dir_contents_to_list($full_df, \@full_deleted_files);
            } else {
                push(@full_deleted_files,$full_df);
            }
        }

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
        mark_docs_for_deletion($archive_info,{},
                               \@full_deleted_files,
                               $archivedir, $verbosity, "delete");


        #
        # 2. Now files for reindexing
        #

        my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
        my @full_reindex_files = ();
        # ensure all filenames are absolute
        foreach my $rf (@reindex_files) {
            my $full_rf =
                (&util::filename_is_absolute($rf))
                ? $rf
                : &util::filename_cat($importdir,$rf);

            if (-d $full_rf) {
                &add_dir_contents_to_list($full_rf, \@full_reindex_files);
            } else {
                push(@full_reindex_files,$full_rf);
            }
        }

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
        mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");

        # And now to ensure the new version of the file processed by
        # appropriate plugin, we need to add it to block_hash reindex list
        foreach my $full_rf (@full_reindex_files) {
            $block_hash->{'reindex_files'}->{$full_rf} = 1;
        }


        #
        # 3. Now finally any new files - add to block_hash new_files list
        #

        my @new_files = keys %{$manifest_lookup->{'index'}};
        my @full_new_files = ();

        foreach my $nf (@new_files) {
            # ensure filename is absolute
            my $full_nf =
                (&util::filename_is_absolute($nf))
                ? $nf
                : &util::filename_cat($importdir,$nf);

            if (-d $full_nf) {
                &add_dir_contents_to_list($full_nf, \@full_new_files);
            } else {
                push(@full_new_files,$full_nf);
            }
        }

        my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
        my $arcinfodb_map = {};
        &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
        foreach my $f (@full_new_files) {
            # check that we haven't seen it already
            if (defined $arcinfodb_map->{$f}) {
                # TODO make better warning
                print STDERR "Warning: $f already in src archive, \n";
            } else {
                $block_hash->{'new_files'}->{$f} = 1;
            }
        }

        undef $arcinfodb_map;
    }
    else {
        # if incremental, we read through the import folder to see whats changed.

        if ($incremental || $incremental_mode eq "onlyadd") {
            prime_doc_oid_count($archivedir);

            # Can now work out which files were new, already existed, and have
            # been deleted

            new_vs_old_import_diff($archive_info,$block_hash,$importdir,
                                   $archivedir,$verbosity,$incremental_mode);

            my @new_files = sort keys %{$block_hash->{'new_files'}};
            # NOTE(review): parens look misplaced — scalar(@new_files)>0 was
            # probably intended; as written the >0 happens inside scalar().
            # Net truthiness is the same for a non-empty list, so behavior
            # is unchanged in practice.
            if (scalar(@new_files>0)) {
                print STDERR "New files and modified metadata files since last import:\n ";
                print STDERR join("\n ",@new_files), "\n";
            }

            if ($incremental) {
                # only look for deletions if we are truely incremental
                my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
                # Filter out any in gsdl/tmp area
                my @filtered_deleted_files = ();
                my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
                my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
                $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
                $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);

                foreach my $df (@deleted_files) {
                    next if ($df =~ m/^$gsdl_tmp_area/);
                    next if ($df =~ m/^$collect_tmp_area/);

                    push(@filtered_deleted_files,$df);
                }


                @deleted_files = @filtered_deleted_files;

                if (scalar(@deleted_files)>0) {
                    print STDERR "Files deleted since last import:\n ";
                    print STDERR join("\n ",@deleted_files), "\n";


                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);

                    mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
                }

                my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};

                if (scalar(@reindex_files)>0) {
                    print STDERR "Files to reindex since last import:\n ";
                    print STDERR join("\n ",@reindex_files), "\n";
                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
                    mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
                }

            }
        }
    }

    # Check for existence of the file that's to contain earliestDateStamp in archivesdir
    # Do nothing if the file already exists (file exists on incremental build).
    # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
    # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
    # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
    # oailastmodified and oailastmodifieddate
    my $earliestDatestampFile = &util::filename_cat($archivedir, "earliestDatestamp");
    if (!-f $earliestDatestampFile && -d $archivedir) {
        my $current_time_in_seconds = time; # in seconds

        if(open(FOUT, ">$earliestDatestampFile")) {
            # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
            print FOUT $current_time_in_seconds;
            close(FOUT);
        }
        else {
            &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
        }

    }

    # now, whichever mode we are in, we can process the entire import folder
    if ((defined $jobs) && ($jobs > 1))
    {
        # if jobs are set to >1, run in parallel using MPI helper
        # [hs, 1 july 2010]
        &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,
                                              $self->{'collection'}, $self->{'site'});
    }
    else
    {
        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }


    if ($saveas eq "FedoraMETS") {
        # create collection "doc obj" for Fedora that contains
        # collection-level metadata

        my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
        $doc_obj->set_OID("collection");

        my $col_name = undef;
        my $col_meta = $collectcfg->{'collectionmeta'};

        if (defined $col_meta) {
            store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
            store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
        }
        $processor->process($doc_obj);
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # Store the value of OIDCount (used in doc.pm) so it can be
    # restored correctly to this value on an incremental build
    store_doc_oid_count($archivedir);

    # write out the archive information file
    $processor->close_file_output() if (defined $groupsize) && ($groupsize > 1);
    $processor->close_group_output() if $processor->is_group();

    # for backwards compatability with archvies.inf file
    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
        $archive_info->save_info($arcinfo_doc_filename);
    }
    else {
        $archive_info->save_revinfo_db($arcinfo_src_filename);
    }

    return $pluginfo;
}
808
809
# Writes the final import/export statistics and closes the log handles.
#
# Parameters:
#   $pluginfo - plugin list returned by process_files
#
# Stats go to $self->{'statsfile'} (STDERR/STDOUT, or a file opened here
# on the bareword handle inexport::STATS; falls back to STDERR if the file
# cannot be opened).  Also closes the OUT handle if new() opened one, and
# the FAILLOG handle opened in read_collection_cfg.
sub generate_statistics
{
    my $self = shift @_;
    my ($pluginfo) = @_;

    my $inexport_mode = $self->{'mode'};

    my $statsfile = $self->{'statsfile'};
    my $out = $self->{'out'};
    my $faillogname = $self->{'faillogname'};
    my $gli = $self->{'gli'};
    my $jobs = $self->{'jobs'};

    # write out import stats

    if ((!defined $jobs) || ($jobs == 1))
    {
        # only output statistics for a single-job run; the parallel
        # (-jobs > 1) path skips them
        # [hs, 1 july 2010]

        my $close_stats = 0;
        if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
            if (open (STATS, ">$statsfile")) {
                $statsfile = 'inexport::STATS';
                $close_stats = 1;
            } else {
                &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
                &gsprintf($out, "{import.stats_backup}\n");
                $statsfile = 'STDERR';
            }
        }

        &gsprintf($out, "\n");
        &gsprintf($out, "*********************************************\n");
        &gsprintf($out, "{$inexport_mode.complete}\n");
        &gsprintf($out, "*********************************************\n");

        &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
        if ($close_stats) {
            close STATS;
        }
    }

    close OUT if $self->{'close_out'};
    close FAILLOG;
}
856
857
# Copy one collection-level metadata field onto a document object.
#
# Parameters:
#   $collectionmeta - hash of field name => { language-key => value }
#   $field          - which field to store (e.g. "collectionname")
#   $doc_obj        - document object receiving the metadata
#
# Each value is added to the top section as "ex.<field>", with a
# "^<lang>" suffix when the key carries a [l=...] language qualifier.
# The (English or unqualified) collection name is additionally stored
# as dc.Title.
sub store_collectionmeta
{
    my ($collectionmeta,$field,$doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $values_by_lang = $collectionmeta->{$field};

    foreach my $lang_key (keys %$values_by_lang)
    {
        my $value = $values_by_lang->{$lang_key};

        # build the metadata label, appending the language suffix when
        # the key is of the form [l=<lang>]
        my $md_label = "ex.$field";
        if ($lang_key =~ m/^\[l=(.*?)\]$/)
        {
            $md_label .= "^$1";
        }

        $doc_obj->add_utf8_metadata($top_section, $md_label, $value);

        # see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
        # while "collectionname" in GS2 is called "name" in GS3.
        # Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
        my $is_coll_name =
            ($md_label eq "ex.collectionname^en") || ($md_label eq "ex.collectionname");
        if ($is_coll_name)
        {
            $doc_obj->add_utf8_metadata($top_section, "dc.Title", $value);
        }
    }
}
895
896
# Full path of the "OIDcount" persistence file inside the archives dir.
sub oid_count_file {
    my ($archive_dir) = @_;
    my $count_path = &util::filename_cat($archive_dir, "OIDcount");
    return $count_path;
}
901
902
# Restore doc.pm's OID counter from the "OIDcount" file saved by a
# previous import (see store_doc_oid_count), so incremental builds keep
# numbering documents where the last build left off.
#
# Parameters:
#   $archivedir - the collection's archives (or export) directory
#
# No return value.  Absent file => no-op (fresh build).  Unreadable file
# => warning on STDERR.  An empty file is now ignored instead of
# assigning undef to the counter (which previously triggered an
# uninitialized-value warning via chomp).
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    return unless -e $oid_count_filename;

    # three-arg open with a lexical handle: a filename can no longer
    # inject an open mode, and the handle closes automatically on scope exit
    if (open(my $oid_in, '<', $oid_count_filename)) {
        my $OIDcount = <$oid_in>;
        close($oid_in);

        if (defined $OIDcount) {
            chomp $OIDcount;
            # hand the persisted value back to doc.pm's package counter
            $doc::OIDcount = $OIDcount;
        }
    }
    else {
        &gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
    }
}
922
# Use the file "OIDcount" in the archives directory to record what value
# doc.pm's OID counter got up to, so prime_doc_oid_count can restore it
# on the next incremental build.
#
# Parameters:
#   $archivedir - the collection's archives (or export) directory
#
# No return value.  Failure to open OR to close (buffered write errors
# only surface at close) is reported via the same warning message.
sub store_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # three-arg open with a lexical handle (was a 2-arg bareword open)
    if (open(my $oid_out, '>', $oid_count_filename)) {
        print {$oid_out} $doc::OIDcount, "\n";

        # check close on a write handle: a full disk shows up here
        close($oid_out)
            or &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
    else {
        &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
941
942
943
sub new_vs_old_import_diff
{
    # Compare the files found by the current import run (the keys of
    # $block_hash->{'all_files'}) against the file list recorded by the
    # previous import ($archive_info->{'prev_import_filelist'}) and
    # classify every file into one of the $block_hash buckets:
    # 'new_files', 'existing_files', 'reindex_files',
    # 'new_or_modified_metadata_files' or 'deleted_files'.
    #
    # Parameters:
    #   $archive_info     - arcinfo object for the collection (provides
    #                       'infodbtype' and 'prev_import_filelist')
    #   $block_hash       - hash of classification buckets; modified in place
    #   $importdir        - import directory, used to absolutize relative paths
    #   $archivedir       - archives directory holding the info databases
    #   $verbosity        - >= 2 enables extra diagnostics on STDERR
    #   $incremental_mode - "all" or (presumably) "onlyadd"; controls how
    #                       previously-imported, unmodified files are treated
    #
    # No meaningful return value; all results are communicated through
    # $block_hash.

    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    # -M yields age in days since last modification, so a file with a
    # *smaller* -M value than the database was touched more recently
    # than the last import (see the comparisons below).
    my $archiveinf_timestamp = -M $arcinfo_doc_filename;

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default

    my $prev_all_files = $archive_info->{'prev_import_filelist'};
    my $full_prev_all_files = {};

    foreach my $prev_file (keys %$prev_all_files) {

	if (!&util::filename_is_absolute($prev_file)) {
	    # NOTE(review): relative previous-import paths are resolved
	    # against the collection dir (GSDLCOLLECTDIR), not $importdir
	    my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
	    $full_prev_all_files->{$full_prev_file} = $prev_file;
	}
	else {
	    $full_prev_all_files->{$prev_file} = $prev_file;
	}
    }


    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

	my $full_curr_file = $curr_file;

	# entry in 'all_files' is moved to either 'existing_files',
	# 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

	if (!&util::filename_is_absolute($curr_file)) {
	    # add in import dir to make absolute
	    $full_curr_file = &util::filename_cat($importdir,$curr_file);
	}

	# figure out if new file or not
	if (defined $full_prev_all_files->{$full_curr_file}) {
	    # delete it so that only files that need deleting are left
	    delete $full_prev_all_files->{$full_curr_file};

	    # had it before. is it a metadata file?
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {

		# is it modified??
		if (-M $full_curr_file < $archiveinf_timestamp) {
		    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
		    # its newer than last build
		    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
		}
		# NOTE(review): an *unmodified* metadata file falls through
		# into no bucket at all (it is simply removed from
		# 'all_files' below) -- presumably intentional, since it
		# needs no reprocessing.
	    }
	    else {
		if ($incremental_mode eq "all") {

		    # had it before
		    $block_hash->{'existing_files'}->{$full_curr_file} = 1;

		}
		else {
		    # Warning in "onlyadd" mode, but had it before!
		    print STDERR "Warning: File $full_curr_file previously imported.\n";
		    print STDERR " Treating as new file\n";

		    $block_hash->{'new_files'}->{$full_curr_file} = 1;

		}
	    }
	}
	else {
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# the new file is the special sort of file greenstone uses
		# to attach metadata to src documents
		# i.e metadata.xml
		# (but note, the filename used is not constrained in
		# Greenstone to always be this)

		print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
		$block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
	    }
	    else {
		$block_hash->{'new_files'}->{$full_curr_file} = 1;
	    }
	}


	delete $block_hash->{'all_files'}->{$curr_file};
    }




    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
	# split the metadata file path into root, directory and extension;
	# only the containing directory is used below
	my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

	$situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
	$situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression

	# Go through existing_files, and mark anything that is contained
	# within 'situated_dir' to be reindexed (in case some of the metadata
	# attaches to one of these files)

	my $reindex_files = [];

	foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

	    # prefix match: the file lives at or below the metadata
	    # file's directory
	    if ($existing_f =~ m/^$situated_dir/) {

		print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

		push(@$reindex_files,$existing_f);
		$block_hash->{'reindex_files'}->{$existing_f} = 1;
		delete $block_hash->{'existing_files'}->{$existing_f};

	    }
	}

	# metadata file needs to be in new_files list so parsed by MetadataXMLPlug
	# (or equivalent)
	$block_hash->{'new_files'}->{$new_mdf} = 1;

    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
	if (-M $existing_filename < $archiveinf_timestamp) {
	    # file is newer than last build

	    my $existing_file = $existing_filename;
	    #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});

	    #my $collectdir_resafe = &util::filename_to_regex($collectdir);
	    #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;

	    print STDERR "**** Reindexing existing file: $existing_file\n";

	    push(@$reindex_files,$existing_file);
	    $block_hash->{'reindex_files'}->{$existing_filename} = 1;
	}

    }


    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # supress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    # NOTE(review): map in void context used purely for side effects;
    # each leftover previous-import file that no longer exists on disk
    # is recorded in the 'deleted_files' bucket.
    my @deleted_files = values %$full_prev_all_files;
    map { my $curr_file = $_;
	  my $full_curr_file = $curr_file;

	  if (!&util::filename_is_absolute($curr_file)) {
	      # add in import dir to make absolute

	      $full_curr_file = &util::filename_cat($collectdir,$curr_file);
	  }


	  if (!-e $full_curr_file) {
	      $block_hash->{'deleted_files'}->{$curr_file} = 1;
	  }
	} @deleted_files;



}
1141
1142
# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
# $mode is 'delete' or 'reindex'
sub mark_docs_for_deletion
{
    # For every file in @$deleted_files, look up (via the
    # "archiveinf-src" database) each OID the file contributed to, drop
    # the file's src record, and flag the affected OIDs with
    # index-status "D" in the "archiveinf-doc" database so the next
    # buildcol.pl deletes or re-imports them.
    #
    # Parameters:
    #   $archive_info  - arcinfo object (infodbtype plus per-OID status)
    #   $block_hash    - classification buckets; 'reindex_files' is
    #                    updated in place
    #   $deleted_files - arrayref of files that were deleted or changed
    #   $archivedir    - archives directory holding the info databases
    #   $verbosity     - > 1 prints a line per OID marked
    #   $mode          - 'delete' or 'reindex' (affects messages and the
    #                    final clean-up pass)
    #
    # No meaningful return value.

    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
	$mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
	# use 'archiveinf-src' info database file to look up all the OIDs
	# that this file is used in (note in most cases, it's just one OID)

	my $src_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
	my $oids = $src_rec->{'oid'};
	# NOTE(review): $file_record_deleted is set to 0 but never read
	# afterwards -- appears to be vestigial
	my $file_record_deleted = 0;

	# delete the src record
	my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
	&dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
	&dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);


	foreach my $oid (@$oids) {

	    # find the source doc (the primary file that becomes this oid)
	    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
	    my $doc_source_file = $doc_rec->{'src-file'}->[0];
	    if (!&util::filename_is_absolute($doc_source_file)) {
		$doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
	    }

	    if ($doc_source_file ne $file) {
		# its an associated or metadata file

		# mark source doc for reimport as one of its assoc files has changed or deleted
		$block_hash->{'reindex_files'}->{$doc_source_file} = 1;

	    }
	    # skip OIDs already marked "D" so they aren't re-written below
	    my $curr_status = $archive_info->get_status_info($oid);
	    if (defined($curr_status) && (($curr_status ne "D"))) {
		if ($verbosity>1) {
		    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
		}
		# mark oid for deletion (it will be deleted or reimported)
		$archive_info->set_status_info($oid,"D");
		# rewrite the raw doc record with its <index-status> line
		# forced to "D", then persist the re-parsed record
		my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
		$val =~ s/^<index-status>(.*)$/<index-status>D/m;

		my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
		my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

		&dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
		&dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
	    }
	}

    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.

    if ($mode eq "delete") {
	foreach my $file (@$deleted_files) {
	    if (defined $block_hash->{'reindex_files'}->{$file}) {
		delete $block_hash->{'reindex_files'}->{$file};
	    }
	}
    }


}
1227
sub add_dir_contents_to_list {

    # Recursively append every file found under $dirname to the array
    # referenced by $list.  The "." / ".." entries and ".svn" admin
    # directories are skipped.
    #
    # Parameters:
    #   $dirname - directory to scan
    #   $list    - arrayref that collects full file paths (modified in place)
    #
    # Returns -1 if $dirname could not be read (and prints a warning);
    # otherwise returns nothing meaningful.

    my ($dirname, $list) = @_;

    # find all the files in the directory
    # (lexical dirhandle instead of the old package-global bareword DIR,
    # which would clash across recursive/concurrent use)
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
	print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    my @dir_entries = readdir ($dir_handle);
    closedir ($dir_handle);

    # foreach instead of a C-style index loop; also drops the unused
    # outer $subfile declaration that the loop variable shadowed
    foreach my $subfile (@dir_entries) {
	next if ($subfile =~ m/^\.\.?$/);   # skip "." and ".."
	next if ($subfile =~ /^\.svn$/);    # skip subversion admin dirs
	my $full_file = &util::filename_cat($dirname, $subfile);
	if (-d $full_file) {
	    # recurse into subdirectory
	    &add_dir_contents_to_list($full_file, $list);
	} else {
	    push (@$list, $full_file);
	}
    }

    return;
}
1256
1257
12581;
Note: See TracBrowser for help on using the repository browser.