source: main/trunk/greenstone2/perllib/inexport.pm@ 23132

Last change on this file since 23132 was 23132, checked in by kjdon, 14 years ago

for manifest files, if the user has specified Index (not Reindex) and the file already existed, then print a warning and don't process it again.

  • Property svn:executable set to *
File size: 36.3 KB
RevLine 
[18457]1###########################################################################
2#
[22413]3# inexport.pm -- useful class to support import.pl and export.pl
[18457]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
[22413]30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
[19789]32
[22413]33use arcinfo;
34use colcfg;
[21553]35use dbutil;
[22464]36use doc;
[22413]37use plugin;
38use plugout;
39use manifest;
40use inexport;
[18457]41use util;
[22413]42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
[18457]47
[22413]48use File::Basename;
[21563]49
# Constructor.  $mode is "import" or "export"; $argv is a reference to
# the raw command-line arguments; $options describes the recognised
# arguments; $opt_listall_options (optional) is the expanded option set
# printed when -listall is given.
#
# Parses the arguments into the new object, prints usage and dies on any
# error, opens the output handle, and records the collection name (the
# single leftover argument).  Returns the blessed inexport object.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # parse2::parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($self->{'listall'}) {
        if ($self->{'xml'}) {
            &PrintUsage::print_xml_usage($opt_listall_options);
        }
        else
        {
            &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
        }
        die "\n";
    }

    if ($self->{'xml'}) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also
    if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # three-argument open so a filename starting with a mode
        # character (e.g. ">>" or "|") cannot change how it is opened
        open (OUT, '>', $out) ||
            (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
        # refer to the handle by its fully-qualified name so it can be
        # passed around as a string (generate_statistics closes OUT)
        $out = 'inexport::OUT';
        $self->{'close_out'} = 1;
    }
    # works on the handle-name string thanks to FileHandle + no strict 'refs'
    $out->autoflush(1);
    $self->{'out'} = $out;

    # @$argv should now hold only one item: the name of the collection
    $self->{'collection'} = shift @$argv;

    # the MPI helper module is only needed when multiple jobs were requested
    if ((defined $self->{'jobs'}) && ($self->{'jobs'}>1)) {
        require ParallelInexport;
    }

    return bless $self, $class;
}
127
# Accessor: the name of the collection being imported/exported, as
# captured from the command line by new().
sub get_collection
{
    my ($self) = @_;

    return $self->{'collection'};
}
134
135
# Resolve the collection directory, open the fail log, then locate and
# parse the collection configuration file.
#
# Arguments: $collection - collection name from the command line
#            $options    - argument description (for usage output)
# Returns:   ($config_filename, $collectcfg) - the config file path and
#            the parsed configuration hash.
# Dies (after printing usage) if the collection cannot be found.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # three-argument open so the log filename cannot inject an open mode
    open (FAILLOG, '>', $faillog) ||
        (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);

    my $faillogname = $faillog;
    # refer to the handle by its fully-qualified name so it can be passed
    # around as a string (generate_statistics closes FAILLOG)
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;

    # Read in the collection configuration file.
    my ($config_filename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
175
# Merge collection-configuration values (from collect.cfg) into the
# options already parsed from the command line, and fill in built-in
# defaults where neither source supplied a value.  The general rule
# throughout: a command-line value wins; collect.cfg is consulted only
# when the command-line value is empty/undefined.
#
# Argument: $collectcfg - the parsed collection configuration hash.
# Side effects: normalises and stores importdir/archivedir (dies if the
# import directory does not exist), and resolves the removeold/keepold/
# incremental trio via scriptutil.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};
    my $importdir = $self->{'importdir'};
    # export.pl uses -exportdir where import.pl uses -archivedir
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'} || "";
    my $out = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
        $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }

    # command line wins; only fall back to collect.cfg when unset
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    if (!-e $importdir) {
        &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
        die "\n";
    }
    $self->{'importdir'} = $importdir;

    # default output directory depends on whether we are importing or exporting
    if ($archivedir eq "") {
        if ($inexport_mode eq "import") {
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
        }
        elsif ($inexport_mode eq "export") {
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
        }
        else {
            print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
            print STDERR " Defaulting to 'archives' for file output\n";
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
        }
    }

    # normalise slashes and strip any trailing one
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $self->{'archivedir'} = $archivedir;

    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    $self->{'verbosity'} = $verbosity;

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
        $self->{'manifest'} = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $self->{'gzip'} = 1;
        }
    }

    if ($self->{'maxdocs'} !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
        } else {
            $self->{'maxdocs'} = -1; # the default
        }
    }

    # a groupsize of 1 is the command-line default, so treat it as "not set"
    if ((defined $self->{'groupsize'}) && ($self->{'groupsize'} == 1)) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $self->{'groupsize'} = $collectcfg->{'groupsize'};
        }
    }

    if (!defined $self->{'OIDtype'}
        || ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) {
        if (defined $collectcfg->{'OIDtype'}
            && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
            $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
        } else {
            $self->{'OIDtype'} = "hash"; # the default
        }
    }

    if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
        if (defined $collectcfg->{'OIDmetadata'}) {
            $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
        } else {
            $self->{'OIDmetadata'} = "dc.Identifier"; # the default
        }
    }

    my $sortmeta = $self->{'sortmeta'};
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    # NOTE(review): groupsize may still be undefined here if neither the
    # command line nor collect.cfg set it - confirm callers always default it
    if (defined $sortmeta && $self->{'groupsize'} > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }
    $self->{'sortmeta'} = $sortmeta;

    if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") {
        $self->{'removeprefix'} = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") {
        $self->{'removesuffix'} = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
                                                   $self->{'incremental'}, $checkdir,
                                                   $collectcfg);

    $self->{'removeold'} = $removeold;
    $self->{'keepold'} = $keepold;
    $self->{'incremental'} = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;
}
321
# Run the actual import/export: load the plugins, choose and configure a
# plugout, work out which source files are new/changed/deleted (either
# from an explicit manifest file or by diffing against the previous
# import), then feed the import directory through the plugin pipeline
# and save the archive information database.
#
# Arguments: $config_filename - path of the collection config file
#            $collectcfg      - parsed collection configuration hash
# Returns:   $pluginfo - the loaded plugin list (later passed to
#            generate_statistics).
sub process_files
{
    my $self = shift @_;
    my ($config_filename,$collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};

    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};

    my $incremental = $self->{'incremental'};
    my $incremental_mode = $self->{'incremental_mode'};

    my $removeold = $self->{'removeold'};
    my $keepold = $self->{'keepold'};

    my $saveas = $self->{'saveas'};
    my $OIDtype = $self->{'OIDtype'};
    my $OIDmetadata = $self->{'OIDmetadata'};

    my $out = $self->{'out'};
    my $faillog = $self->{'faillog'};

    my $maxdocs = $self->{'maxdocs'};
    my $gzip = $self->{'gzip'};
    my $groupsize = $self->{'groupsize'};
    my $sortmeta = $self->{'sortmeta'};

    my $removeprefix = $self->{'removeprefix'};
    my $removesuffix = $self->{'removesuffix'};

    my $gli = $self->{'gli'};

    my $jobs = $self->{'jobs'};
    my $epoch = $self->{'epoch'};

    # related to export
    my $xsltfile = $self->{'xsltfile'};
    my $group_marc = $self->{'group_marc'};
    my $mapping_file = $self->{'mapping_file'};
    my $xslt_mets = $self->{'xslt_mets'};
    my $xslt_txt = $self->{'xslt_txt'};
    my $fedora_namespace = $self->{'fedora_namespace'};

    if ($inexport_mode eq "import") {
        print STDERR "<Import>\n" if $gli;
    }
    else {
        print STDERR "<export>\n" if $gli;
    }

    # direct method call rather than indirect-object "new manifest(...)" syntax
    my $manifest_lookup = manifest->new($collectcfg->{'infodbtype'},$archivedir);
    if ($self->{'manifest'} ne "") {
        my $manifest_filename = $self->{'manifest'};

        if (!&util::filename_is_absolute($manifest_filename)) {
            $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
        }

        $self->{'manifest'} =~ s/[\\\/]+/\//g;
        $self->{'manifest'} =~ s/\/$//;

        $manifest_lookup->parse($manifest_filename);
    }

    my $manifest = $self->{'manifest'};

    # load all the plugins
    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    my $plugin_incr_mode = $incremental_mode;
    if ($manifest ne "") {
        # if we have a manifest file, then we pretend we are fully incremental for plugins
        $plugin_incr_mode = "all";
    }
    # some global options for the plugins
    my @global_opts = ();

    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory (and tmp directory) if needed
    if ($removeold) {
        if (-e $archivedir) {
            &gsprintf($out, "{import.removing_archives}\n");
            &util::rm_r ($archivedir);
        }
        my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
        $tmpdir =~ s/[\\\/]+/\//g;
        $tmpdir =~ s/\/$//;
        if (-e $tmpdir) {
            &gsprintf($out, "{import.removing_tmpdir}\n");
            &util::rm_r ($tmpdir);
        }
    }

    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file

    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);

    my $archive_info = arcinfo->new ($collectcfg->{'infodbtype'});
    $archive_info->load_info ($arcinfo_doc_filename);

    if ($manifest eq "") {
        # Load in list of files in import folder from last import (if present)
        $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    }

    #### Use Plugout ####
    # Work out which plugout saves the processed documents: either the one
    # named in collect.cfg, or one derived from the -saveas option.
    my $plugout;

    if ($inexport_mode eq "import") {
        if (defined $collectcfg->{'plugout'}) {
            # If a plugout was specified in the collect.cfg file, assume it is sensible
            # We can't check the name because it could be anything, if it is a custom plugout
            $plugout = $collectcfg->{'plugout'};
        }
        else {
            if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
                push @$plugout,"GreenstoneXMLPlugout";
            }
            else {
                push @$plugout,$saveas."Plugout";
            }
        }
    }
    else {
        if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
            $plugout = $collectcfg->{'plugout'};
        }
        else {
            if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
                push @$plugout,"GreenstoneMETSPlugout";
            }
            else {
                push @$plugout,$saveas."Plugout";
            }
        }
    }

    my $plugout_name = $plugout->[0];

    push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
    push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
    push @$plugout,("-debug") if ($debug);
    push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
    push @$plugout,("-gzip_output") if ($gzip);
    push @$plugout,("-output_handle",$out) if (defined $out);

    push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");

    # plugout-specific options
    if ($plugout_name =~ m/^MARCXMLPlugout$/) {
        push @$plugout,("-group") if ($group_marc);
        push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "");
    }
    if ($plugout_name =~ m/^.*METSPlugout$/) {
        push @$plugout,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $xslt_mets ne "");
        push @$plugout,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $xslt_txt ne "");
    }

    if ($plugout_name eq "FedoraMETSPlugout") {
        push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "");
    }

    my $processor = &plugout::load_plugout($plugout);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($removeold) {
        # occasionally, plugins may want to do something on remove old, eg pharos image indexing
        &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
    }

    # process the import directory
    my $block_hash = {};
    $block_hash->{'new_files'} = {};
    $block_hash->{'reindex_files'} = {};
    my $metadata = {};

    # global blocking pass may set up some metadata
    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);

    if ($manifest ne "") {
        #
        # 1. Process delete files first
        #
        my @deleted_files = keys %{$manifest_lookup->{'delete'}};
        my @full_deleted_files = ();

        # ensure all filenames are absolute
        foreach my $df (@deleted_files) {
            my $full_df =
                (&util::filename_is_absolute($df))
                ? $df
                : &util::filename_cat($importdir,$df);

            # a directory entry means "delete everything below it"
            if (-d $full_df) {
                &add_dir_contents_to_list($full_df, \@full_deleted_files);
            } else {
                push(@full_deleted_files,$full_df);
            }
        }

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
        mark_docs_for_deletion($archive_info,{},
                               \@full_deleted_files,
                               $archivedir, $verbosity, "delete");

        #
        # 2. Now files for reindexing
        #

        my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
        my @full_reindex_files = ();
        # ensure all filenames are absolute
        foreach my $rf (@reindex_files) {
            my $full_rf =
                (&util::filename_is_absolute($rf))
                ? $rf
                : &util::filename_cat($importdir,$rf);

            if (-d $full_rf) {
                &add_dir_contents_to_list($full_rf, \@full_reindex_files);
            } else {
                push(@full_reindex_files,$full_rf);
            }
        }

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
        mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");

        # And now to ensure the new version of the file processed by
        # appropriate plugin, we need to add it to block_hash reindex list
        foreach my $full_rf (@full_reindex_files) {
            $block_hash->{'reindex_files'}->{$full_rf} = 1;
        }

        #
        # 3. Now finally any new files - add to block_hash new_files list
        #

        my @new_files = keys %{$manifest_lookup->{'index'}};
        my @full_new_files = ();

        foreach my $nf (@new_files) {
            # ensure filename is absolute
            my $full_nf =
                (&util::filename_is_absolute($nf))
                ? $nf
                : &util::filename_cat($importdir,$nf);

            if (-d $full_nf) {
                &add_dir_contents_to_list($full_nf, \@full_new_files);
            } else {
                push(@full_new_files,$full_nf);
            }
        }

        # $arcinfo_src_filename was already computed above; the previous
        # version redeclared it here, shadowing the outer lexical
        my $arcinfodb_map = {};
        &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
        foreach my $f (@full_new_files) {
            # skip any file the source archive already knows about:
            # "Index" must not re-process it (use "Reindex" for that)
            if (defined $arcinfodb_map->{$f}) {
                print STDERR "Warning: $f already in src archive; skipping (use Reindex to re-process it)\n";
            } else {
                $block_hash->{'new_files'}->{$f} = 1;
            }
        }

        undef $arcinfodb_map;
    }
    else {
        # if incremental, we read through the import folder to see whats changed.

        if ($incremental || $incremental_mode eq "onlyadd") {
            prime_doc_oid_count($archivedir);

            # Can now work out which files were new, already existed, and have
            # been deleted

            new_vs_old_import_diff($archive_info,$block_hash,$importdir,
                                   $archivedir,$verbosity,$incremental_mode);

            my @new_files = sort keys %{$block_hash->{'new_files'}};
            # was "scalar(@new_files>0)" - worked only by accident
            if (scalar(@new_files) > 0) {
                print STDERR "New files and modified metadata files since last import:\n ";
                print STDERR join("\n ",@new_files), "\n";
            }

            if ($incremental) {
                # only look for deletions if we are truly incremental
                my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
                # Filter out any in gsdl/tmp area
                my @filtered_deleted_files = ();
                my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
                my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
                $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
                $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);

                foreach my $df (@deleted_files) {
                    next if ($df =~ m/^$gsdl_tmp_area/);
                    next if ($df =~ m/^$collect_tmp_area/);

                    push(@filtered_deleted_files,$df);
                }

                @deleted_files = @filtered_deleted_files;

                if (scalar(@deleted_files)>0) {
                    print STDERR "Files deleted since last import:\n ";
                    print STDERR join("\n ",@deleted_files), "\n";

                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);

                    mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
                }

                my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};

                if (scalar(@reindex_files)>0) {
                    print STDERR "Files to reindex since last import:\n ";
                    print STDERR join("\n ",@reindex_files), "\n";
                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
                    mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
                }

            }
        }
    }

    # now, whichever mode we are in, we can process the entire import folder
    if ((defined $jobs) && ($jobs > 1))
    {
        # if jobs are set to >1, run in parallel using MPI helper
        # [hs, 1 july 2010]
        &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,
                                              $self->{'collection'}, $self->{'site'});
    }
    else
    {
        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }

    if ($saveas eq "FedoraMETS") {
        # create collection "doc obj" for Fedora that contains
        # collection-level metadata

        my $doc_obj = doc->new($config_filename,"nonindexed_doc","none");
        $doc_obj->set_OID("collection");

        my $col_meta = $collectcfg->{'collectionmeta'};

        if (defined $col_meta) {
            store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
            store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
        }
        $processor->process($doc_obj);
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # Store the value of OIDCount (used in doc.pm) so it can be
    # restored correctly to this value on an incremental build
    store_doc_oid_count($archivedir);

    # write out the archive information file
    $processor->close_file_output() if (defined $groupsize) && ($groupsize > 1);
    $processor->close_group_output() if $processor->is_group();

    # for backwards compatibility with archives.inf file
    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
        $archive_info->save_info($arcinfo_doc_filename);
    }
    else {
        $archive_info->save_revinfo_db($arcinfo_src_filename);
    }

    return $pluginfo;
}
732
733
# Write the end-of-run statistics (via plugin::write_stats) and close
# the OUT and FAILLOG handles opened by new()/read_collection_cfg().
#
# Argument: $pluginfo - the plugin list returned by process_files().
sub generate_statistics
{
    my $self = shift @_;
    my ($pluginfo) = @_;

    my $inexport_mode = $self->{'mode'};

    my $statsfile = $self->{'statsfile'};
    my $out = $self->{'out'};
    my $faillogname = $self->{'faillogname'};
    my $gli = $self->{'gli'};
    my $jobs = $self->{'jobs'};

    # write out import stats

    if ((!defined $jobs) || ($jobs == 1))
    {
        # statistics are skipped when running multiple parallel jobs
        # [hs, 1 july 2010]

        my $close_stats = 0;
        if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
            # three-argument open so the filename cannot inject an open mode
            if (open (STATS, '>', $statsfile)) {
                # pass the handle around by its fully-qualified name
                $statsfile = 'inexport::STATS';
                $close_stats = 1;
            } else {
                # fall back to STDERR if the stats file cannot be created
                &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
                &gsprintf($out, "{import.stats_backup}\n");
                $statsfile = 'STDERR';
            }
        }

        &gsprintf($out, "\n");
        &gsprintf($out, "*********************************************\n");
        &gsprintf($out, "{$inexport_mode.complete}\n");
        &gsprintf($out, "*********************************************\n");

        &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
        if ($close_stats) {
            close STATS;
        }
    }

    # OUT was only opened (and close_out set) when -out named a file
    close OUT if $self->{'close_out'};
    close FAILLOG;
}
780
781
# Copy one collection-level metadata field (e.g. "collectionname" or
# "collectionextra") from the collect.cfg metadata hash onto the top
# section of the given document object, as "ex.<field>" metadata.  A key
# of the form "[l=xx]" adds a "^xx" language suffix to the label.  The
# (optionally English) collection name is additionally recorded as
# dc.Title.
sub store_collectionmeta
{
    my ($collectionmeta,$field,$doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $values_by_lang = $collectionmeta->{$field};

    foreach my $lang_key (keys %$values_by_lang)
    {
        my $value = $values_by_lang->{$lang_key};

        # build the metadata label, appending a language suffix if the
        # key carries one (e.g. "[l=en]" -> "^en")
        my $md_label = "ex.$field";
        $md_label .= "^$1" if ($lang_key =~ m/^\[l=(.*?)\]$/);

        $doc_obj->add_utf8_metadata($top_section,$md_label, $value);

        # see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
        # while "collectionname" in GS2 is called "name" in GS3.
        # Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
        if ($md_label eq "ex.collectionname" || $md_label eq "ex.collectionname^en")
        {
            $doc_obj->add_utf8_metadata($top_section,"dc.Title", $value);
        }
    }
}
[22413]819
820
# Full path of the "OIDcount" file kept in the given archives directory,
# used to persist doc.pm's OID counter between builds.
sub oid_count_file
{
    my ($archive_dir) = @_;

    return &util::filename_cat ($archive_dir, "OIDcount");
}
825
826
# Restore doc.pm's OID counter ($doc::OIDcount) from the "OIDcount" file
# in the archives directory, if one exists, so that an incremental build
# continues numbering where the previous build left off.  Warns (and
# leaves the counter untouched) if the file exists but cannot be read.
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    if (-e $oid_count_filename) {
        # lexical handle + three-argument open (was bareword two-arg open)
        if (open(my $oid_in, '<', $oid_count_filename)) {
            my $OIDcount = <$oid_in>;
            chomp $OIDcount;
            close($oid_in);

            $doc::OIDcount = $OIDcount;
        }
        else {
            print STDERR "Warning: unable to read document OID count from $oid_count_filename\n";
            print STDERR "Setting value to 0\n";
        }
    }

}
848
# Persist doc.pm's OID counter for the next incremental build.
sub store_doc_oid_count
{
    # Use the file "OIDcount" in the archives directory to record
    # what value doc.pm got up to

    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # lexical handle + three-argument open (was bareword two-arg open)
    if (open(my $oid_out, '>', $oid_count_filename)) {
        print {$oid_out} $doc::OIDcount, "\n";

        close($oid_out);
    }
    else {
        print STDERR "Warning: unable to store document OID count\n";
    }
}
867
868
869
[18457]870sub new_vs_old_import_diff
871{
[20578]872 my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;
[18457]873
[21620]874 # Get the infodbtype value for this collection from the arcinfo object
875 my $infodbtype = $archive_info->{'infodbtype'};
876
[20776]877 # in this method, we want to know if metadata files are modified or not.
[21620]878 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
[20776]879
880 my $archiveinf_timestamp = -M $arcinfo_doc_filename;
881
[18457]882 # First convert all files to absolute form
883 # This is to support the situation where the import folder is not
884 # the default
885
886 my $prev_all_files = $archive_info->{'prev_import_filelist'};
887 my $full_prev_all_files = {};
888
889 foreach my $prev_file (keys %$prev_all_files) {
890
891 if (!&util::filename_is_absolute($prev_file)) {
892 my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
893 $full_prev_all_files->{$full_prev_file} = $prev_file;
894 }
895 else {
896 $full_prev_all_files->{$prev_file} = $prev_file;
897 }
898 }
899
[18469]900
[18457]901 # Figure out which are the new files, existing files and so
902 # by implication the files from the previous import that are not
903 # there any more => mark them for deletion
904 foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
905
906 my $full_curr_file = $curr_file;
907
908 # entry in 'all_files' is moved to either 'existing_files',
[20776]909 # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'
[18457]910
911 if (!&util::filename_is_absolute($curr_file)) {
912 # add in import dir to make absolute
913 $full_curr_file = &util::filename_cat($importdir,$curr_file);
914 }
915
[19498]916 # figure out if new file or not
[18457]917 if (defined $full_prev_all_files->{$full_curr_file}) {
[20776]918 # delete it so that only files that need deleting are left
919 delete $full_prev_all_files->{$full_curr_file};
920
921 # had it before. is it a metadata file?
922 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
[20578]923
[20776]924 # is it modified??
925 if (-M $full_curr_file < $archiveinf_timestamp) {
926 print STDERR "*** Detected a modified metadata file: $full_curr_file\n" if $verbosity > 2;
927 # its newer than last build
928 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
929 }
[20578]930 }
931 else {
[20776]932 if ($incremental_mode eq "all") {
933
934 # had it before
935 $block_hash->{'existing_files'}->{$full_curr_file} = 1;
936
937 }
938 else {
939 # Warning in "onlyadd" mode, but had it before!
940 print STDERR "Warning: File $full_curr_file previously imported.\n";
941 print STDERR " Treating as new file\n";
942
943 $block_hash->{'new_files'}->{$full_curr_file} = 1;
944
945 }
[20578]946 }
947 }
948 else {
949 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
950 # the new file is the special sort of file greenstone uses
951 # to attach metadata to src documents
952 # i.e metadata.xml
953 # (but note, the filename used is not constrained in
954 # Greenstone to always be this)
[18457]955
[20776]956 print STDERR "***** Detected new metadata file: $full_curr_file\n" if $verbosity > 2;
957 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
[20578]958 }
959 else {
960 $block_hash->{'new_files'}->{$full_curr_file} = 1;
961 }
[18457]962 }
[20578]963
[18457]964
965 delete $block_hash->{'all_files'}->{$curr_file};
966 }
967
[20578]968
[21306]969
970
[20776]971 # Deal with complication of new or modified metadata files by forcing
[20578]972 # everything from this point down in the file hierarchy to
973 # be freshly imported.
974 #
975 # This may mean files that have not changed are reindexed, but does
976 # guarantee by the end of processing all new metadata is correctly
977 # associated with the relevant document(s).
978
[20776]979 foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
[20578]980 my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");
981
982 $situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
[20769]983 $situated_dir =~ s/\\/\\\\/g; # need to protect windows slash \ in regular expression
984
[20578]985 # Go through existing_files, and mark anything that is contained
986 # within 'situated_dir' to be reindexed (in case some of the metadata
987 # attaches to one of these files)
988
989 my $reindex_files = [];
990
991 foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {
[20769]992
[20578]993 if ($existing_f =~ m/^$situated_dir/) {
994 push(@$reindex_files,$existing_f);
995 $block_hash->{'reindex_files'}->{$existing_f} = 1;
[21306]996 delete $block_hash->{'existing_files'}->{$existing_f};
[20578]997
998 }
999 }
1000
1001 # metadata file needs to be in new_files list so parsed by MetadataXMLPlug
1002 # (or equivalent)
1003 $block_hash->{'new_files'}->{$new_mdf} = 1;
1004
1005 }
1006
[21306]1007 # go through remaining existing files and work out what has changed and needs to be reindexed.
1008 my @existing_files = sort keys %{$block_hash->{'existing_files'}};
1009
1010 my $reindex_files = [];
1011
1012 foreach my $existing_filename (@existing_files) {
1013 if (-M $existing_filename < $archiveinf_timestamp) {
1014 # file is newer than last build
1015
1016 my $existing_file = $existing_filename;
1017 #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});
1018
1019 #my $collectdir_resafe = &util::filename_to_regex($collectdir);
1020 #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;
1021
1022 print STDERR "**** Reindexing existing file: $existing_file\n";
1023
1024 push(@$reindex_files,$existing_file);
1025 $block_hash->{'reindex_files'}->{$existing_filename} = 1;
1026 }
1027
1028 }
1029
[20578]1030
[18469]1031 # By this point full_prev_all_files contains the files
1032 # mentioned in archiveinf-src.db but are not in the 'import'
1033 # folder (or whatever was specified through -importdir ...)
1034
1035 # This list can contain files that were created in the 'tmp' or
1036 # 'cache' areas (such as screen-size and thumbnail images).
[18457]1037 #
[18469]1038 # In building the final list of files to delete, we test to see if
[20578]1039 # it exists on the filesystem and if it does (unusual for a "normal"
1040 # file in import, but possible in the case of 'tmp' files),
1041 # supress it from going into the final list
[18469]1042
1043 my $collectdir = $ENV{'GSDLCOLLECTDIR'};
1044
[18457]1045 my @deleted_files = values %$full_prev_all_files;
[18469]1046 map { my $curr_file = $_;
1047 my $full_curr_file = $curr_file;
1048
1049 if (!&util::filename_is_absolute($curr_file)) {
1050 # add in import dir to make absolute
1051
1052 $full_curr_file = &util::filename_cat($collectdir,$curr_file);
1053 }
1054
1055
1056 if (!-e $full_curr_file) {
1057 $block_hash->{'deleted_files'}->{$curr_file} = 1;
1058 }
1059 } @deleted_files;
[20578]1060
1061
1062
[18457]1063}
1064
[19498]1065
# Mark documents for deletion (or reindexing) in the archive info databases.
#
# Used both to delete "deleted" docs and to remove old versions of "changed"
# docs: for every file in @$deleted_files, each OID that the file contributed
# to is given index-status "D" so that the next buildcol.pl pass deletes (or,
# for changed docs, re-imports) it.
#
# Parameters:
#   $archive_info  - arcinfo object for the collection (holds per-OID status
#                    and the collection's 'infodbtype')
#   $block_hash    - shared hash of file classifications; this sub adds to
#                    $block_hash->{'reindex_files'}
#   $deleted_files - array ref of source-file paths to process
#   $archivedir    - the collection's archives directory (where the
#                    archiveinf-doc / archiveinf-src databases live)
#   $verbosity     - integer; >1 prints a per-OID progress line to STDERR
#   $mode          - 'delete' or 'reindex'; only affects the message text and
#                    the final clean-up pass over 'reindex_files'
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    # Human-readable action for the progress message below.
    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
	$mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # Paths to the two info databases: archiveinf-doc maps OID -> doc record,
    # archiveinf-src maps source file -> the OIDs it belongs to.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
	# use 'archiveinf-src' info database file to look up all the OIDs
	# that this file is used in (note in most cases, it's just one OID)

	my $src_rec_string = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
	my $src_rec = &dbutil::convert_infodb_string_to_hash($src_rec_string);
	my $oids = $src_rec->{'oid'};
	# NOTE(review): $file_record_deleted is never read in this sub -
	# looks like dead state left over from an earlier revision.
	my $file_record_deleted = 0;

	# delete the src record
	# (open/write/close per file; "append" mode semantics are defined by
	# dbutil - presumably required so earlier entries survive. TODO confirm)
	my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
	&dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
	&dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);


	foreach my $oid (@$oids) {

	    # find the source doc (the primary file that becomes this oid)
	    my $doc_rec_string = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
	    my $doc_rec = &dbutil::convert_infodb_string_to_hash($doc_rec_string);
	    my $doc_source_file = $doc_rec->{'src-file'}->[0];
	    if (!&util::filename_is_absolute($doc_source_file)) {
		# relative src-file paths are stored relative to the collection dir
		$doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
	    }

	    if ($doc_source_file ne $file) {
		# its an associated or metadata file

		# mark source doc for reimport as one of its assoc files has changed or deleted
		# (if $mode is "delete" and the source doc itself was also
		# deleted, the clean-up loop at the end removes it again)
		$block_hash->{'reindex_files'}->{$doc_source_file} = 1;

	    }
	    # Only touch OIDs not already marked "D", so an OID shared by
	    # several deleted files is processed (and written) just once.
	    my $curr_status = $archive_info->get_status_info($oid);
	    if (defined($curr_status) && (($curr_status ne "D"))) {
		if ($verbosity>1) {
		    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
		}
		# mark oid for deletion (it will be deleted or reimported)
		$archive_info->set_status_info($oid,"D");
		# NOTE(review): this re-reads the same entry fetched into
		# $doc_rec_string above with no intervening write for this
		# oid - the extra read looks redundant.
		my $val = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
		# Rewrite the <index-status> line to "D" in the raw record
		# string before converting it back to a hash for writing.
		$val =~ s/^<index-status>(.*)$/<index-status>D/m;

		my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
		my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

		&dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
		&dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
	    }
	}

    }
    # now go through and check that we haven't marked any primary files for reindex (because their associated files have changed/deleted) when they have been deleted themselves. only in delete mode.
    if ($mode eq "delete") {
	foreach my $file (@$deleted_files) {
	    if (defined $block_hash->{'reindex_files'}->{$file}) {
		delete $block_hash->{'reindex_files'}->{$file};
	    }
	}
    }


}
1147
# Recursively collect every plain file beneath $dirname into @$list.
#
# Subdirectories are descended into rather than added to the list; the
# "." and ".." entries and ".svn" metadata directories are skipped.
#
# Parameters:
#   $dirname - directory to scan (absolute or relative path)
#   $list    - array ref onto which full file paths are pushed
#
# Returns -1 (after printing a warning to STDERR) if the directory cannot
# be read; otherwise returns nothing.
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # Use a lexical directory handle (the old bareword DIR was a shared
    # global) and report unreadable directories rather than dying.
    my $dir_handle;
    if (!opendir($dir_handle, $dirname)) {
	print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    my @dir_entries = readdir($dir_handle);
    closedir($dir_handle);

    foreach my $subfile (@dir_entries) {
	next if ($subfile =~ m/^\.\.?$/);  # skip "." and ".."
	next if ($subfile =~ /^\.svn$/);   # skip subversion metadata dirs
	my $full_file = &util::filename_cat($dirname, $subfile);
	if (-d $full_file) {
	    # recurse into subdirectory (handle already closed, so safe)
	    &add_dir_contents_to_list($full_file, $list);
	}
	else {
	    push(@$list, $full_file);
	}
    }
    return;
}
[18554]1176
[23053]1177
[18457]11781;
Note: See TracBrowser for help on using the repository browser.