source: main/trunk/greenstone2/perllib/inexport.pm@ 23132

Last change on this file since 23132 was 23132, checked in by kjdon, 11 years ago

for manifest files, if the user has specified Index (not Reindex) and the file already existed, then print a warning and don't process it again.

  • Property svn:executable set to *
File size: 36.3 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use plugin;
38use plugout;
39use manifest;
40use inexport;
41use util;
42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
47
48use File::Basename;
49
# Constructor for the inexport helper used by import.pl and export.pl.
#
# Arguments:
#   $class               - class to bless the new object into
#   $mode                - stored as $self->{'mode'}; other subs in this file
#                          compare it against "import" and "export"
#   $argv                - array ref of command line arguments; parse2::parse
#                          is given this ref, and the first remaining element
#                          is shifted off as the collection name
#   $options             - option description hash; $options->{'args'} is
#                          handed to the parser and the whole hash to the
#                          usage printers
#   $opt_listall_options - option description printed when -listall is set
#
# Returns the blessed object.  Dies with "\n" after printing usage when
# parsing fails, when -listall is given, or when the leftover argument count
# is not exactly one (or the first leftover argument looks like -h).  With
# -xml the usage is printed as XML and the object is returned early, before
# any output handle has been set up.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($self->{'listall'}) {
        if ($self->{'xml'}) {
            &PrintUsage::print_xml_usage($opt_listall_options);
        }
        else
        {
            &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
        }
        die "\n";
    }


    if ($self->{'xml'}) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also

    if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # Three-argument open so that characters in the user supplied file
        # name can never be taken as part of the open mode.  The bareword
        # handle OUT is kept on purpose: it is referred to by the name
        # 'inexport::OUT' below and closed as OUT in generate_statistics.
        if (!open (OUT, '>', $out)) {
            # Report and die unconditionally; the die no longer depends on
            # the value returned by gsprintf.
            &gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out);
            die;
        }
        $out = 'inexport::OUT';
        $self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # The parallel helper module is only loaded when more than one job
    # has been requested
    if ((defined $self->{'jobs'}) && ($self->{'jobs'}>1)) {
        require ParallelInexport;
    }

    return bless $self, $class;
}
127
# Accessor: returns the collection name held in $self->{'collection'}
# (the value the constructor shifts off the argument list).
sub get_collection
{
    my ($self) = @_;

    my $collection_name = $self->{'collection'};
    return $collection_name;
}
134
135
# Select the collection, open the fail log and read the collection
# configuration file.
#
# Arguments:
#   $collection - collection name, passed to colcfg::use_collection together
#                 with $self->{'site'} and $self->{'collectdir'}
#   $options    - option description hash, used only to print usage when
#                 use_collection returns the empty string
#
# Side effects:
#   - unshifts "$ENV{'GSDLCOLLECTDIR'}/perllib" onto @INC
#   - opens the fail log for writing on the handle FAILLOG, replaces
#     $self->{'faillog'} with the handle name 'inexport::FAILLOG' and keeps
#     the file name in $self->{'faillogname'}
#
# Returns ($config_filename, $collectcfg).  Dies if the collection cannot be
# used or the fail log cannot be opened.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    # An option that was never set is treated the same as an empty one
    if (!defined $faillog || $faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # Three-argument open keeps the file name from being read as a mode.
    # The bareword handle FAILLOG is kept because it is referred to by the
    # name 'inexport::FAILLOG' below and closed in generate_statistics.
    if (!open (FAILLOG, '>', $faillog)) {
        # Report and die unconditionally; the die no longer depends on the
        # value returned by gsprintf.
        &gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog);
        die;
    }


    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;

    # Read in the collection configuration file.
    my ($config_filename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
175
# Merge the options parsed from the command line (already stored in $self)
# with the values found in the collection configuration $collectcfg.
#
# In general a value from $collectcfg is only taken when the corresponding
# command line option is empty or not valid, and a hard-wired default is
# used when neither supplies one.  The resolved values are written back
# into $self; $collectcfg->{'infodbtype'} is filled in when it was missing.
#
# Dies with "\n" (after a message on the output handle) when the import
# directory does not exist.
sub set_collection_options
{
    my ($self, $collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};
    my $out           = $self->{'out'};

    # turn every run of \ and / into a single / and drop one trailing /
    my $tidy_path = sub {
        my ($path) = @_;
        $path =~ s/[\\\/]+/\//g;
        $path =~ s/\/$//;
        return $path;
    };

    my $verbosity  = $self->{'verbosity'};
    my $importdir  = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'} || "";

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type()
        unless defined($collectcfg->{'infodbtype'});

    # directories named in collect.cfg are used when none were given
    $importdir = $collectcfg->{'importdir'}
        if (defined $collectcfg->{'importdir'} && $importdir eq "");
    $archivedir = $collectcfg->{'archivedir'}
        if (defined $collectcfg->{'archivedir'} && $archivedir eq "");

    # import directory: fall back to <collection>/import, tidy, must exist
    if ($importdir eq "") {
        $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import");
    }
    $importdir = $tidy_path->($importdir);
    unless (-e $importdir) {
        &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
        die "\n";
    }
    $self->{'importdir'} = $importdir;

    # output directory: the default sub-directory depends on the mode
    if ($archivedir eq "") {
        my %default_subdir = ('import' => "archives", 'export' => "export");
        my $subdir = $default_subdir{$inexport_mode};
        if (!defined $subdir) {
            print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
            print STDERR " Defaulting to 'archives' for file output\n";
            $subdir = "archives";
        }
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $subdir);
    }
    $self->{'archivedir'} = $tidy_path->($archivedir);

    # verbosity: command line, then collect.cfg, then 2
    unless ($verbosity =~ /\d+/) {
        my $cfg_verbosity = $collectcfg->{'verbosity'};
        $verbosity = (defined $cfg_verbosity && $cfg_verbosity =~ /\d+/)
            ? $cfg_verbosity
            : 2; # the default
    }
    $self->{'verbosity'} = $verbosity;

    $self->{'manifest'} = $collectcfg->{'manifest'}
        if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "");

    # gzip can be switched on, but never off, by collect.cfg
    $self->{'gzip'} = 1
        if (defined $collectcfg->{'gzip'}
            && !$self->{'gzip'}
            && $collectcfg->{'gzip'} =~ /^true$/i);

    # maxdocs: command line, then collect.cfg, then -1
    unless ($self->{'maxdocs'} =~ /\-?\d+/) {
        my $cfg_maxdocs = $collectcfg->{'maxdocs'};
        $self->{'maxdocs'} = (defined $cfg_maxdocs && $cfg_maxdocs =~ /\-?\d+/)
            ? $cfg_maxdocs
            : -1; # the default
    }

    # a group size of exactly 1 may be replaced by the collect.cfg value
    if (defined $self->{'groupsize'}
        && $self->{'groupsize'} == 1
        && defined $collectcfg->{'groupsize'}
        && $collectcfg->{'groupsize'} =~ /\d+/) {
        $self->{'groupsize'} = $collectcfg->{'groupsize'};
    }

    # OIDtype: command line, then collect.cfg, then "hash"
    my $own_oidtype = $self->{'OIDtype'};
    unless (defined $own_oidtype
            && $own_oidtype =~ /^(hash|incremental|assigned|dirname)$/) {
        my $cfg_oidtype = $collectcfg->{'OIDtype'};
        if (defined $cfg_oidtype
            && $cfg_oidtype =~ /^(hash|incremental|assigned|dirname)$/) {
            $self->{'OIDtype'} = $cfg_oidtype;
        }
        else {
            $self->{'OIDtype'} = "hash"; # the default
        }
    }

    # OIDmetadata: command line, then collect.cfg, then "dc.Identifier"
    if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
        $self->{'OIDmetadata'} = (defined $collectcfg->{'OIDmetadata'})
            ? $collectcfg->{'OIDmetadata'}
            : "dc.Identifier"; # the default
    }

    my $sortmeta = $self->{'sortmeta'};
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    # a blank sortmeta counts as no sortmeta at all
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    # sortmeta cannot be used with group size
    if (defined $sortmeta && $self->{'groupsize'} > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }
    $self->{'sortmeta'} = $sortmeta;

    # removeprefix / removesuffix come from collect.cfg when left empty
    foreach my $option ('removeprefix', 'removesuffix') {
        if (defined $collectcfg->{$option} && $self->{$option} eq "") {
            $self->{$option} = $collectcfg->{$option};
        }
    }

    # debug and gli can be switched on by collect.cfg
    foreach my $flag ('debug', 'gli') {
        if (defined $collectcfg->{$flag} && $collectcfg->{$flag} =~ /^true$/i) {
            $self->{$flag} = 1;
        }
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    @{$self}{'removeold', 'keepold', 'incremental', 'incremental_mode'}
        = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
                                                   $self->{'incremental'}, $checkdir,
                                                   $collectcfg);
}
321
# Run the import (or export) over the files of a collection.
#
# Arguments:
#   $config_filename - name of the collection configuration file; only used
#                      here to build the collection level document written
#                      when -saveas is FedoraMETS
#   $collectcfg      - collection configuration hash (reads 'infodbtype',
#                      'plugin', 'plugout' and 'collectionmeta')
#
# Outline:
#   1. parse the manifest file if one was given
#   2. load the plugins, and clear out the old output when removeold is set
#   3. load the archive information and build the plugout with its options
#   4. work out which files are to be deleted, reindexed or newly added,
#      either from the manifest or (when incremental) by comparing the
#      import folder with the previous import
#   5. run the plugins over the import directory, serially or in parallel
#   6. store the OID count and save the archive information
#
# Returns the loaded plugin list ($pluginfo).  Dies with "\n" when no
# plugins could be loaded.
sub process_files
{
    my $self = shift @_;
    my ($config_filename,$collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};

    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};

    my $incremental = $self->{'incremental'};
    my $incremental_mode = $self->{'incremental_mode'};

    my $removeold = $self->{'removeold'};

    my $saveas = $self->{'saveas'};
    my $OIDtype = $self->{'OIDtype'};
    my $OIDmetadata = $self->{'OIDmetadata'};

    my $out = $self->{'out'};
    my $faillog = $self->{'faillog'};

    my $maxdocs = $self->{'maxdocs'};
    my $gzip = $self->{'gzip'};
    my $groupsize = $self->{'groupsize'};
    my $sortmeta = $self->{'sortmeta'};

    my $removeprefix = $self->{'removeprefix'};
    my $removesuffix = $self->{'removesuffix'};

    my $gli = $self->{'gli'};

    my $jobs = $self->{'jobs'};
    my $epoch = $self->{'epoch'};

    # related to export
    my $xsltfile = $self->{'xsltfile'};
    my $group_marc = $self->{'group_marc'};
    my $mapping_file = $self->{'mapping_file'};
    my $xslt_mets = $self->{'xslt_mets'};
    my $xslt_txt = $self->{'xslt_txt'};
    my $fedora_namespace = $self->{'fedora_namespace'};

    if ($inexport_mode eq "import") {
        print STDERR "<Import>\n" if $gli;
    }
    else {
        print STDERR "<export>\n" if $gli;
    }

    # Class->new is used for every constructor call in this sub: this
    # package defines its own sub new, which makes the indirect object form
    # (new manifest ...) ambiguous to the parser.
    my $manifest_lookup = manifest->new($collectcfg->{'infodbtype'},$archivedir);
    if ($self->{'manifest'} ne "") {
        my $manifest_filename = $self->{'manifest'};

        if (!&util::filename_is_absolute($manifest_filename)) {
            $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
        }

        # NOTE(review): the tidy-up below is applied to the stored option,
        # not to $manifest_filename which is the name actually parsed.
        # Left as found - confirm which of the two was intended.
        $self->{'manifest'} =~ s/[\\\/]+/\//g;
        $self->{'manifest'} =~ s/\/$//;

        $manifest_lookup->parse($manifest_filename);
    }

    my $manifest = $self->{'manifest'};

    # load all the plugins
    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    my $plugin_incr_mode = $incremental_mode;
    if ($manifest ne "") {
        # if we have a manifest file, then we pretend we are fully incremental for plugins
        $plugin_incr_mode = "all";
    }
    #some global options for the plugins
    my @global_opts = ();

    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory (and tmp directory) if needed
    if ($removeold) {
        if (-e $archivedir) {
            &gsprintf($out, "{import.removing_archives}\n");
            &util::rm_r ($archivedir);
        }
        my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
        $tmpdir =~ s/[\\\/]+/\//g;
        $tmpdir =~ s/\/$//;
        if (-e $tmpdir) {
            &gsprintf($out, "{import.removing_tmpdir}\n");
            &util::rm_r ($tmpdir);
        }
    }

    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file

    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);

    my $archive_info = arcinfo->new($collectcfg->{'infodbtype'});
    $archive_info->load_info ($arcinfo_doc_filename);

    if ($manifest eq "") {
        # Load in list of files in import folder from last import (if present)
        $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    }

    ####Use Plugout####
    my $plugout;

    if ($inexport_mode eq "import") {
        if (defined $collectcfg->{'plugout'}) {
            # If a plugout was specified in the collect.cfg file, assume it is sensible
            # We can't check the name because it could be anything, if it is a custom plugout
            #
            # Options are pushed onto this list further down, so work on a
            # copy and leave the caller's configuration untouched.
            my $cfg_plugout = $collectcfg->{'plugout'};
            $plugout = (ref($cfg_plugout) eq 'ARRAY') ? [ @$cfg_plugout ] : $cfg_plugout;
        }
        else{
            if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
                push @$plugout,"GreenstoneXMLPlugout";
            }
            else{
                push @$plugout,$saveas."Plugout";
            }
        }
    }
    else {
        # NOTE(review): the plugout entry is indexed as an array reference
        # everywhere else in this sub, in which case this match runs against
        # the stringified reference and presumably never succeeds, so -saveas
        # always decides the export plugout.  Left as found, because matching
        # on the plugout name instead would let collect.cfg override -saveas.
        if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
            $plugout = $collectcfg->{'plugout'};
        }
        else{
            if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
                push @$plugout,"GreenstoneMETSPlugout";
            }
            else{
                push @$plugout,$saveas."Plugout";
            }
        }
    }

    my $plugout_name = $plugout->[0];

    push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
    push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
    push @$plugout,("-debug") if ($debug);
    push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
    push @$plugout,("-gzip_output") if ($gzip);
    push @$plugout,("-output_handle",$out) if (defined $out);

    push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");

    if ($plugout_name =~ m/^MARCXMLPlugout$/) {
        push @$plugout,("-group") if ($group_marc);
        push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "");
    }
    if ($plugout_name =~ m/^.*METSPlugout$/) {
        push @$plugout,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $xslt_mets ne "");
        push @$plugout,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $xslt_txt ne "");
    }

    if ($plugout_name eq "FedoraMETSPlugout") {
        push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "");
    }


    my $processor = &plugout::load_plugout($plugout);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($removeold) {
        # occasionally, plugins may want to do something on remove old, eg pharos image indexing
        &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
    }

    # process the import directory
    my $block_hash = {};
    $block_hash->{'new_files'} = {};
    $block_hash->{'reindex_files'} = {};
    my $metadata = {};

    # global blocking pass may set up some metadata
    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);

    if ($manifest ne "") {
        #
        # 1. Process delete files first
        #
        my @deleted_files = keys %{$manifest_lookup->{'delete'}};
        my @full_deleted_files = ();

        # ensure all filenames are absolute
        foreach my $df (@deleted_files) {
            my $full_df =
                (&util::filename_is_absolute($df))
                ? $df
                : &util::filename_cat($importdir,$df);

            if (-d $full_df) {
                &add_dir_contents_to_list($full_df, \@full_deleted_files);
            } else {
                push(@full_deleted_files,$full_df);
            }
        }

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
        mark_docs_for_deletion($archive_info,{},
                               \@full_deleted_files,
                               $archivedir, $verbosity, "delete");


        #
        # 2. Now files for reindexing
        #

        my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
        my @full_reindex_files = ();
        # ensure all filenames are absolute
        foreach my $rf (@reindex_files) {
            my $full_rf =
                (&util::filename_is_absolute($rf))
                ? $rf
                : &util::filename_cat($importdir,$rf);

            if (-d $full_rf) {
                &add_dir_contents_to_list($full_rf, \@full_reindex_files);
            } else {
                push(@full_reindex_files,$full_rf);
            }
        }

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
        mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");

        # And now to ensure the new version of the file processed by
        # appropriate plugin, we need to add it to block_hash reindex list
        foreach my $full_rf (@full_reindex_files) {
            $block_hash->{'reindex_files'}->{$full_rf} = 1;
        }


        #
        # 3. Now finally any new files - add to block_hash new_files list
        #

        my @new_files = keys %{$manifest_lookup->{'index'}};
        my @full_new_files = ();

        foreach my $nf (@new_files) {
            # ensure filename is absolute
            my $full_nf =
                (&util::filename_is_absolute($nf))
                ? $nf
                : &util::filename_cat($importdir,$nf);

            if (-d $full_nf) {
                &add_dir_contents_to_list($full_nf, \@full_new_files);
            } else {
                push(@full_new_files,$full_nf);
            }
        }

        # $arcinfo_src_filename was already worked out above from the same
        # infodbtype and archive directory, so it is reused here
        my $arcinfodb_map = {};
        &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
        foreach my $f (@full_new_files) {
            # check that we haven't seen it already
            if (defined $arcinfodb_map->{$f}) {
                # TODO make better warning
                print STDERR "Warning: $f already in src archive, \n";
            } else {
                $block_hash->{'new_files'}->{$f} = 1;
            }
        }

        undef $arcinfodb_map;
    }
    else {
        # if incremental, we read through the import folder to see whats changed.

        if ($incremental || $incremental_mode eq "onlyadd") {
            prime_doc_oid_count($archivedir);

            # Can now work out which files were new, already existed, and have
            # been deleted

            new_vs_old_import_diff($archive_info,$block_hash,$importdir,
                                   $archivedir,$verbosity,$incremental_mode);

            my @new_files = sort keys %{$block_hash->{'new_files'}};
            if (scalar(@new_files) > 0) {
                print STDERR "New files and modified metadata files since last import:\n ";
                print STDERR join("\n ",@new_files), "\n";
            }

            if ($incremental) {
                # only look for deletions if we are truly incremental
                my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
                # Filter out any in gsdl/tmp area
                my @filtered_deleted_files = ();
                my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
                my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
                $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
                $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);

                foreach my $df (@deleted_files) {
                    next if ($df =~ m/^$gsdl_tmp_area/);
                    next if ($df =~ m/^$collect_tmp_area/);

                    push(@filtered_deleted_files,$df);
                }


                @deleted_files = @filtered_deleted_files;

                if (scalar(@deleted_files)>0) {
                    print STDERR "Files deleted since last import:\n ";
                    print STDERR join("\n ",@deleted_files), "\n";


                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);

                    mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
                }

                my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};

                if (scalar(@reindex_files)>0) {
                    print STDERR "Files to reindex since last import:\n ";
                    print STDERR join("\n ",@reindex_files), "\n";
                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
                    mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
                }

            }
        }
    }

    # now, whichever mode we are in, we can process the entire import folder
    if ((defined $jobs) && ($jobs > 1))
    {
        # if jobs are set to >1, run in parallel using MPI helper
        # [hs, 1 july 2010]
        &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,
                                              $self->{'collection'}, $self->{'site'});
    }
    else
    {
        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }


    if ($saveas eq "FedoraMETS") {
        # create collection "doc obj" for Fedora that contains
        # collection-level metadata

        my $doc_obj = doc->new($config_filename,"nonindexed_doc","none");
        $doc_obj->set_OID("collection");

        my $col_meta = $collectcfg->{'collectionmeta'};

        if (defined $col_meta) {
            store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
            store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
        }
        $processor->process($doc_obj);
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # Store the value of OIDCount (used in doc.pm) so it can be
    # restored correctly to this value on an incremental build
    store_doc_oid_count($archivedir);

    # write out the archive information file
    $processor->close_file_output() if (defined $groupsize) && ($groupsize > 1);
    $processor->close_group_output() if $processor->is_group();

    # for backwards compatibility with archives.inf file
    # NOTE(review): as written the pattern matches "contents" anywhere in
    # the name, or ".inf" at its end - confirm that is the intent.
    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
        $archive_info->save_info($arcinfo_doc_filename);
    }
    else {
        $archive_info->save_revinfo_db($arcinfo_src_filename);
    }

    return $pluginfo;
}
732
733
# Print the completion banner and the plugin statistics, then close the
# handles this module opened.
#
# Arguments:
#   $pluginfo - the plugin list returned by process_files, passed on to
#               plugin::write_stats
#
# The banner and statistics are only produced for a serial run, that is
# when $self->{'jobs'} is undefined or not greater than 1.  This mirrors
# the "$jobs > 1" test that selects the parallel code path elsewhere in
# this file, so every run that processed its files serially reports them.
#
# Statistics go to $self->{'statsfile'}: STDERR or STDOUT by name, or a
# file which is opened here.  If that file cannot be opened a message is
# printed and STDERR is used instead.
sub generate_statistics
{
    my $self = shift @_;
    my ($pluginfo) = @_;

    my $inexport_mode = $self->{'mode'};

    my $statsfile = $self->{'statsfile'};
    my $out = $self->{'out'};
    my $faillogname = $self->{'faillogname'};
    my $gli = $self->{'gli'};
    my $jobs = $self->{'jobs'};

    # write out import stats

    if ((!defined $jobs) || ($jobs <= 1))
    {
        # only output statistics when the files were not farmed out to
        # multiple jobs
        # [hs, 1 july 2010]

        my $close_stats = 0;
        if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
            # Three-argument open keeps the file name from being read as a
            # mode; the bareword handle is kept because it is handed on by
            # the name 'inexport::STATS'.
            if (open (STATS, '>', $statsfile)) {
                $statsfile = 'inexport::STATS';
                $close_stats = 1;
            } else {
                &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
                &gsprintf($out, "{import.stats_backup}\n");
                $statsfile = 'STDERR';
            }
        }

        &gsprintf($out, "\n");
        &gsprintf($out, "*********************************************\n");
        &gsprintf($out, "{$inexport_mode.complete}\n");
        &gsprintf($out, "*********************************************\n");

        &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
        if ($close_stats) {
            close STATS;
        }
    }

    # OUT is only closed if the constructor opened it
    close OUT if $self->{'close_out'};
    close FAILLOG;
}
780
781
# Copy one field of the collection level metadata onto a document object.
#
# Arguments:
#   $collectionmeta - hash ref; $collectionmeta->{$field} is itself a hash
#                     ref from key to value
#   $field          - name of the field to copy (e.g. "collectionname")
#   $doc_obj        - object providing get_top_section and add_utf8_metadata
#
# Every value is added to the top section as "ex.<field>".  When its key has
# the form "[l=xx]" the label becomes "ex.<field>^xx".  Values labelled
# "ex.collectionname" or "ex.collectionname^en" are added as "dc.Title" too.
sub store_collectionmeta
{
    my ($collectionmeta,$field,$doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $values_for  = $collectionmeta->{$field};

    foreach my $key (keys %$values_for)
    {
        my $value = $values_for->{$key};

        # a key such as [l=fr] contributes the suffix ^fr to the label
        my $label = "ex.$field";
        $label .= "^$1" if ($key =~ m/^\[l=(.*?)\]$/);

        $doc_obj->add_utf8_metadata($top_section, $label, $value);

        # see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
        # while "collectionname" in GS2 is called "name" in GS3.
        # Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
        my $is_collection_title =
            ($label eq "ex.collectionname^en") || ($label eq "ex.collectionname");
        if ($is_collection_title)
        {
            $doc_obj->add_utf8_metadata($top_section, "dc.Title", $value);
        }
    }
}
819
820
# Returns the path of the file named "OIDcount" inside $archivedir,
# as assembled by util::filename_cat.
sub oid_count_file {
    my $archivedir = shift(@_);

    return &util::filename_cat($archivedir, "OIDcount");
}
825
826
# Restore $doc::OIDcount from the "OIDcount" file in $archivedir.
#
# Arguments:
#   $archivedir - directory holding the OIDcount file
#
# If the file does not exist $doc::OIDcount is left untouched.  If it
# exists its first line (without the newline) becomes the new value.  If
# it exists but cannot be opened, a warning is printed and the value is
# set to 0 as the warning states; an empty file also gives 0.
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    return unless (-e $oid_count_filename);

    if (open(my $oid_in, '<', $oid_count_filename)) {
        my $OIDcount = <$oid_in>;
        close($oid_in);

        # nothing could be read (e.g. an empty file): start from 0
        # rather than leaving the counter undefined
        if (defined $OIDcount) {
            chomp $OIDcount;
        }
        else {
            $OIDcount = 0;
        }

        $doc::OIDcount = $OIDcount;
    }
    else {
        print STDERR "Warning: unable to read document OID count from $oid_count_filename\n";
        print STDERR "Setting value to 0\n";
        # do what the message above announces
        $doc::OIDcount = 0;
    }
}
848
# Use the file "OIDcount" in the archives directory to record
# what value doc.pm got up to.
#
# Arguments:
#   $archivedir - directory in which the OIDcount file is written
#
# Writes $doc::OIDcount followed by a newline.  A warning is printed on
# STDERR if the file cannot be opened, or if writing or closing it fails.
sub store_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    my $stored = 0;
    if (open(my $oid_out, '>', $oid_count_filename)) {
        my $printed = print {$oid_out} $doc::OIDcount, "\n";
        # buffered write errors only show up when the handle is closed
        my $closed = close($oid_out);
        $stored = ($printed && $closed) ? 1 : 0;
    }

    if (!$stored) {
        print STDERR "Warning: unable to store document OID count\n";
    }
}
867
868
869
870sub new_vs_old_import_diff
871{
872 my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;
873
874 # Get the infodbtype value for this collection from the arcinfo object
875 my $infodbtype = $archive_info->{'infodbtype'};
876
877 # in this method, we want to know if metadata files are modified or not.
878 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
879
880 my $archiveinf_timestamp = -M $arcinfo_doc_filename;
881
882 # First convert all files to absolute form
883 # This is to support the situation where the import folder is not
884 # the default
885
886 my $prev_all_files = $archive_info->{'prev_import_filelist'};
887 my $full_prev_all_files = {};
888
889 foreach my $prev_file (keys %$prev_all_files) {
890
891 if (!&util::filename_is_absolute($prev_file)) {
892 my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
893 $full_prev_all_files->{$full_prev_file} = $prev_file;
894 }
895 else {
896 $full_prev_all_files->{$prev_file} = $prev_file;
897 }
898 }
899
900
901 # Figure out which are the new files, existing files and so
902 # by implication the files from the previous import that are not
903 # there any more => mark them for deletion
904 foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
905
906 my $full_curr_file = $curr_file;
907
908 # entry in 'all_files' is moved to either 'existing_files',
909 # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'
910
911 if (!&util::filename_is_absolute($curr_file)) {
912 # add in import dir to make absolute
913 $full_curr_file = &util::filename_cat($importdir,$curr_file);
914 }
915
916 # figure out if new file or not
917 if (defined $full_prev_all_files->{$full_curr_file}) {
918 # delete it so that only files that need deleting are left
919 delete $full_prev_all_files->{$full_curr_file};
920
921 # had it before. is it a metadata file?
922 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
923
924 # is it modified??
925 if (-M $full_curr_file < $archiveinf_timestamp) {
926 print STDERR "*** Detected a modified metadata file: $full_curr_file\n" if $verbosity > 2;
927 # its newer than last build
928 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
929 }
930 }
931 else {
932 if ($incremental_mode eq "all") {
933
934 # had it before
935 $block_hash->{'existing_files'}->{$full_curr_file} = 1;
936
937 }
938 else {
939 # Warning in "onlyadd" mode, but had it before!
940 print STDERR "Warning: File $full_curr_file previously imported.\n";
941 print STDERR " Treating as new file\n";
942
943 $block_hash->{'new_files'}->{$full_curr_file} = 1;
944
945 }
946 }
947 }
948 else {
949 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
950 # the new file is the special sort of file greenstone uses
951 # to attach metadata to src documents
952 # i.e metadata.xml
953 # (but note, the filename used is not constrained in
954 # Greenstone to always be this)
955
956 print STDERR "***** Detected new metadata file: $full_curr_file\n" if $verbosity > 2;
957 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
958 }
959 else {
960 $block_hash->{'new_files'}->{$full_curr_file} = 1;
961 }
962 }
963
964
965 delete $block_hash->{'all_files'}->{$curr_file};
966 }
967
968
969
970
971 # Deal with complication of new or modified metadata files by forcing
972 # everything from this point down in the file hierarchy to
973 # be freshly imported.
974 #
975 # This may mean files that have not changed are reindexed, but does
976 # guarantee by the end of processing all new metadata is correctly
977 # associated with the relevant document(s).
978
979 foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
980 my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");
981
982 $situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
983 $situated_dir =~ s/\\/\\\\/g; # need to protect windows slash \ in regular expression
984
985 # Go through existing_files, and mark anything that is contained
986 # within 'situated_dir' to be reindexed (in case some of the metadata
987 # attaches to one of these files)
988
989 my $reindex_files = [];
990
991 foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {
992
993 if ($existing_f =~ m/^$situated_dir/) {
994 push(@$reindex_files,$existing_f);
995 $block_hash->{'reindex_files'}->{$existing_f} = 1;
996 delete $block_hash->{'existing_files'}->{$existing_f};
997
998 }
999 }
1000
1001 # metadata file needs to be in new_files list so parsed by MetadataXMLPlug
1002 # (or equivalent)
1003 $block_hash->{'new_files'}->{$new_mdf} = 1;
1004
1005 }
1006
1007 # go through remaining existing files and work out what has changed and needs to be reindexed.
1008 my @existing_files = sort keys %{$block_hash->{'existing_files'}};
1009
1010 my $reindex_files = [];
1011
1012 foreach my $existing_filename (@existing_files) {
1013 if (-M $existing_filename < $archiveinf_timestamp) {
1014 # file is newer than last build
1015
1016 my $existing_file = $existing_filename;
1017 #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});
1018
1019 #my $collectdir_resafe = &util::filename_to_regex($collectdir);
1020 #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;
1021
1022 print STDERR "**** Reindexing existing file: $existing_file\n";
1023
1024 push(@$reindex_files,$existing_file);
1025 $block_hash->{'reindex_files'}->{$existing_filename} = 1;
1026 }
1027
1028 }
1029
1030
1031 # By this point full_prev_all_files contains the files
1032 # mentioned in archiveinf-src.db but are not in the 'import'
1033 # folder (or whatever was specified through -importdir ...)
1034
1035 # This list can contain files that were created in the 'tmp' or
1036 # 'cache' areas (such as screen-size and thumbnail images).
1037 #
1038 # In building the final list of files to delete, we test to see if
1039 # it exists on the filesystem and if it does (unusual for a "normal"
1040 # file in import, but possible in the case of 'tmp' files),
1041 # supress it from going into the final list
1042
1043 my $collectdir = $ENV{'GSDLCOLLECTDIR'};
1044
1045 my @deleted_files = values %$full_prev_all_files;
1046 map { my $curr_file = $_;
1047 my $full_curr_file = $curr_file;
1048
1049 if (!&util::filename_is_absolute($curr_file)) {
1050 # add in import dir to make absolute
1051
1052 $full_curr_file = &util::filename_cat($collectdir,$curr_file);
1053 }
1054
1055
1056 if (!-e $full_curr_file) {
1057 $block_hash->{'deleted_files'}->{$curr_file} = 1;
1058 }
1059 } @deleted_files;
1060
1061
1062
1063}
1064
1065
# Flags the archived documents that depend on the given source files with
# index-status "D", so that the next buildcol.pl removes them from the index
# (mode 'delete') or replaces them with a fresh import (mode 'reindex').
#
# Arguments:
#   $archive_info  - arcinfo object: supplies 'infodbtype' plus the
#                    get_status_info/set_status_info accessors
#   $block_hash    - hash ref; its 'reindex_files' entry is updated in place
#   $deleted_files - array ref of source filenames, used as keys into the
#                    archiveinf-src database
#   $archivedir    - directory handed to dbutil to locate the archiveinf-doc
#                    and archiveinf-src databases
#   $verbosity     - values above 1 report every OID that gets marked
#   $mode          - 'delete' or 'reindex'
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    # only affects the wording of the progress message
    my $mode_text = ($mode eq "reindex") ? "reindexed" : "deleted from index";

    # the arcinfo object knows which infodb flavour this collection uses
    my $infodbtype = $archive_info->{'infodbtype'};

    my $doc_db_path = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $src_db_path = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);

    foreach my $src_file (@$deleted_files) {

        # archiveinf-src lists every OID this file is used in
        # (usually there is just the one)
        my $src_entry_string = &dbutil::read_infodb_entry($infodbtype, $src_db_path, $src_file);
        my $src_entry = &dbutil::convert_infodb_string_to_hash($src_entry_string);
        my $oid_list = $src_entry->{'oid'};

        # remove the src record for this file
        my $src_out = &dbutil::open_infodb_write_handle($infodbtype, $src_db_path, "append");
        &dbutil::delete_infodb_entry($infodbtype, $src_out, $src_file);
        &dbutil::close_infodb_write_handle($infodbtype, $src_out);

        foreach my $oid (@$oid_list) {

            # look up the primary file that this OID was built from
            my $doc_entry_string = &dbutil::read_infodb_entry($infodbtype, $doc_db_path, $oid);
            my $doc_entry = &dbutil::convert_infodb_string_to_hash($doc_entry_string);
            my $primary_file = $doc_entry->{'src-file'}->[0];
            $primary_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$primary_file)
                unless &util::filename_is_absolute($primary_file);

            # a mismatch means $src_file is an associated or metadata file,
            # so the primary document has to be imported again
            $block_hash->{'reindex_files'}->{$primary_file} = 1
                if ($primary_file ne $src_file);

            # nothing further to do for unknown OIDs or ones already marked
            my $status = $archive_info->get_status_info($oid);
            next unless defined($status);
            next if ($status eq "D");

            print STDERR "$oid ($primary_file) marked to be $mode_text on next buildcol.pl\n"
                if ($verbosity > 1);

            # mark the OID in memory (it will be deleted or reimported) ...
            $archive_info->set_status_info($oid,"D");

            # ... and write the same status back into the stored doc record
            my $doc_string = &dbutil::read_infodb_entry($infodbtype, $doc_db_path, $oid);
            $doc_string =~ s/^<index-status>(.*)$/<index-status>D/m;
            my $updated_entry = &dbutil::convert_infodb_string_to_hash($doc_string);

            my $doc_out = &dbutil::open_infodb_write_handle($infodbtype, $doc_db_path, "append");
            &dbutil::write_infodb_entry($infodbtype, $doc_out, $oid, $updated_entry);
            &dbutil::close_infodb_write_handle($infodbtype, $doc_out);
        }
    }

    # A primary file may have been queued for reindexing above (because one
    # of its associated files changed or vanished) even though it has been
    # deleted itself.  Only in delete mode: take such files out again.
    if ($mode eq "delete") {
        foreach my $src_file (@$deleted_files) {
            delete $block_hash->{'reindex_files'}->{$src_file}
                if defined $block_hash->{'reindex_files'}->{$src_file};
        }
    }
}
1147
# Recursively collects the full path of every non-directory entry found
# beneath $dirname and appends it to the array referenced by $list.
#
# Arguments:
#   $dirname - directory to scan
#   $list    - array ref that receives the paths (modified in place)
#
# The "." and ".." entries and anything named ".svn" are skipped.  A
# sub-directory that cannot be read produces a warning on STDERR but does
# not stop the scan of its siblings.
#
# Returns -1 if $dirname itself cannot be opened, 0 otherwise.
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # Use a lexical directory handle: a bareword DIR handle is shared by the
    # whole package, so opening it here would silently close any DIR handle
    # a caller still has open.
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
        print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
        return -1; # error in processing
    }

    # read everything up front so the handle is closed before recursing
    my @entries = readdir ($dir_handle);
    closedir ($dir_handle);

    foreach my $subfile (@entries) {
        next if ($subfile =~ m/^\.\.?$/);
        next if ($subfile =~ /^\.svn$/);
        my $full_file = &util::filename_cat($dirname, $subfile);
        if (-d $full_file) {
            # recur over directory contents
            &add_dir_contents_to_list($full_file, $list);
        } else {
            push (@$list, $full_file);
        }
    }

    return 0;
}
1176
1177
11781;
Note: See TracBrowser for help on using the repository browser.