source: main/trunk/greenstone2/perllib/inexport.pm@ 23767

Last change on this file since 23767 was 23767, checked in by davidb, 13 years ago

CGI Perl scripts updated to work with Greenstone 3. This in turn required these supporting routines to optionally handle the 'site' variable

  • Property svn:executable set to *
File size: 37.0 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use plugin;
38use plugout;
39use manifest;
40use inexport;
41use util;
42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
47
48use File::Basename;
49
# Constructor used by the command-line entry points (import.pl / export.pl).
#
# Arguments:
#   $mode                - "import" or "export"
#   $argv                - ref to the remaining command-line arguments; after
#                          parsing, exactly one element (the collection name)
#                          is expected to be left
#   $options             - option spec hash for this script (parse2 format,
#                          with the arg definitions under 'args')
#   $opt_listall_options - option spec used only when -listall is given
#
# Parses the command line into $self, handles the usage/-xml/-listall early
# exits (which die after printing), redirects output if -out names a file,
# and returns the blessed inexport object.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    # -listall: print every available option (xml or text form) and stop
    if ($self->{'listall'}) {
        if ($self->{'xml'}) {
            &PrintUsage::print_xml_usage($opt_listall_options);
        }
        else
        {
            &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
        }
        die "\n";
    }

    # -xml: emit the machine-readable usage and return early (no collection
    # name is required for this mode, hence the check below is skipped)
    if ($self->{'xml'}) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also

    if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # If -out names a file (rather than STDOUT/STDERR), open it on the
    # package-level OUT handle and store the handle *name* as a string;
    # this works because the file starts with "no strict 'refs'".
    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        open (OUT, ">$out") ||
            (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
        $out = 'inexport::OUT';
        $self->{'close_out'} = 1;    # remember to close it in generate_statistics()
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # Only pull in the MPI helper when a parallel run was requested
    if ((defined $self->{'jobs'}) && ($self->{'jobs'}>1)) {
        require ParallelInexport;
    }

    return bless $self, $class;
}
127
# Simplified version of the constructor for use with CGI scripts.
#
# No command-line parsing happens here: output is fixed to STDERR, the fail
# log is left empty (read_collection_cfg fills in a default), and the
# collect directory is derived either from the $gsdl_cgi helper (Greenstone 3,
# site-aware via $opt_site) or from $GSDLHOME/collect (Greenstone 2).
sub newCGI
{
    my $class = shift (@_);
    my ($mode, $collect, $gsdl_cgi, $opt_site) = @_;

    my $self = {
        'xml'        => 0,
        'mode'       => $mode,
        'out'        => STDERR,
        'faillog'    => "",
        'collection' => $collect,
    };

    if (defined $gsdl_cgi) {
        # GS3: the site determines where collections live
        $self->{'site'}       = $opt_site;
        $self->{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }
    else {
        # GS2: collections always live under $GSDLHOME/collect
        $self->{'site'}       = "";
        $self->{'collectdir'} = &util::filename_cat($ENV{'GSDLHOME'}, "collect");
    }

    return bless $self, $class;
}
# Accessor: return the name of the collection this run operates on.
sub get_collection
{
    my ($self) = @_;

    return $self->{'collection'};
}
159
160
# Locate the collection, open the fail log, and read the collection's
# configuration file.
#
# Arguments:
#   $collection - collection name as given by the user
#   $options    - option spec, used only to print usage if the collection
#                 cannot be found (in which case this sub dies)
#
# Side effects: sets $ENV{'GSDLCOLLECTDIR'} (via colcfg::use_collection),
# prepends the collection's perllib to @INC, opens the package-level FAILLOG
# handle (closed later in generate_statistics), and stores the handle name
# and path in $self->{'faillog'} / $self->{'faillogname'}.
#
# Returns ($config_filename, $collectcfg) - the config file path and the
# parsed configuration hash.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    # use_collection validates the collection and sets GSDLCOLLECTDIR;
    # an empty return means the collection could not be found
    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    open (FAILLOG, ">$faillog") ||
        (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);

    # Keep the path for reporting, but from here on $self->{'faillog'} holds
    # the handle *name* as a string (usable thanks to "no strict 'refs'")
    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;

    # Read in the collection configuration file.
    my ($config_filename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
200
# Merge command-line options with collect.cfg settings and fill in defaults.
#
# Precedence throughout: an explicit command-line value wins, then the
# collect.cfg value, then a hard-coded default.  Results are written back
# into $self (importdir, archivedir, verbosity, maxdocs, OIDtype, sortmeta,
# removeold/keepold/incremental, etc.).  Dies if the import directory does
# not exist.  Note: $collectcfg->{'infodbtype'} is also normalised in place.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};
    my $importdir = $self->{'importdir'};
    # exports use 'exportdir' as the output directory name
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'} || "";
    my $out = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
        $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
        # we can't use the text version for archives dbs.
        $collectcfg->{'infodbtype'} = "gdbm";
    }
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    if (!-e $importdir) {
        &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
        die "\n";
    }
    $self->{'importdir'} = $importdir;

    # default output directory depends on whether we are importing or exporting
    if ($archivedir eq "") {
        if ($inexport_mode eq "import") {
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
        }
        elsif ($inexport_mode eq "export") {
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
        }
        else {
            print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
            print STDERR "         Defaulting to 'archives' for file output\n";
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
        }
    }

    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $self->{'archivedir'} = $archivedir;

    # a non-numeric verbosity means "not set on the command line"
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    $self->{'verbosity'} = $verbosity;

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
        $self->{'manifest'} = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $self->{'gzip'} = 1;
        }
    }

    if ($self->{'maxdocs'} !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
        } else {
            $self->{'maxdocs'} = -1; # the default
        }
    }

    # groupsize == 1 is treated as "unset", so collect.cfg may override it
    if ((defined $self->{'groupsize'}) && ($self->{'groupsize'} == 1)) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $self->{'groupsize'} = $collectcfg->{'groupsize'};
        }
    }

    if (!defined $self->{'OIDtype'}
        || ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) {
        if (defined $collectcfg->{'OIDtype'}
            && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
            $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
        } else {
            $self->{'OIDtype'} = "hash"; # the default
        }
    }

    if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
        if (defined $collectcfg->{'OIDmetadata'}) {
            $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
        } else {
            $self->{'OIDmetadata'} = "dc.Identifier"; # the default
        }
    }

    my $sortmeta = $self->{'sortmeta'};
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $self->{'groupsize'} > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }
    $self->{'sortmeta'} = $sortmeta;

    if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") {
        $self->{'removeprefix'} = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") {
        $self->{'removesuffix'} = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
                                                   $self->{'incremental'}, $checkdir,
                                                   $collectcfg);

    $self->{'removeold'} = $removeold;
    $self->{'keepold'} = $keepold;
    $self->{'incremental'} = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;
}
349
# Run the import/export over the collection's import directory.
#
# Arguments:
#   $config_filename - path of the collection configuration file
#   $collectcfg      - parsed collection configuration (from read_collection_cfg)
#
# Loads the plugins and the output plugout, determines which files are
# new / modified / deleted (either from an explicit manifest file or by
# diffing the import folder against the previous import), then feeds the
# import directory through the plugin pipeline and saves the archive
# information database.
#
# Returns $pluginfo (the loaded plugin list) for generate_statistics().
sub process_files
{
    my $self = shift @_;
    my ($config_filename,$collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    # unpack the options established by new()/set_collection_options()
    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};

    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};

    my $incremental = $self->{'incremental'};
    my $incremental_mode = $self->{'incremental_mode'};

    my $removeold = $self->{'removeold'};
    my $keepold = $self->{'keepold'};

    my $saveas = $self->{'saveas'};
    my $OIDtype = $self->{'OIDtype'};
    my $OIDmetadata = $self->{'OIDmetadata'};

    my $out = $self->{'out'};
    my $faillog = $self->{'faillog'};

    my $maxdocs = $self->{'maxdocs'};
    my $gzip = $self->{'gzip'};
    my $groupsize = $self->{'groupsize'};
    my $sortmeta = $self->{'sortmeta'};

    my $removeprefix = $self->{'removeprefix'};
    my $removesuffix = $self->{'removesuffix'};

    my $gli = $self->{'gli'};

    my $jobs = $self->{'jobs'};
    my $epoch = $self->{'epoch'};

    # related to export
    my $xsltfile = $self->{'xsltfile'};
    my $group_marc = $self->{'group_marc'};
    my $mapping_file = $self->{'mapping_file'};
    my $xslt_mets = $self->{'xslt_mets'};
    my $xslt_txt = $self->{'xslt_txt'};
    my $fedora_namespace = $self->{'fedora_namespace'};

    # progress markers for the GLI front-end
    if ($inexport_mode eq "import") {
        print STDERR "<Import>\n" if $gli;
    }
    else {
        print STDERR "<export>\n" if $gli;
    }

    # --- manifest handling: parse the manifest file if one was given ---
    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
    if ($self->{'manifest'} ne "") {
        my $manifest_filename = $self->{'manifest'};

        if (!&util::filename_is_absolute($manifest_filename)) {
            $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
        }

        # normalise slashes in the stored manifest path
        $self->{'manifest'} =~ s/[\\\/]+/\//g;
        $self->{'manifest'} =~ s/\/$//;

        $manifest_lookup->parse($manifest_filename);
    }

    my $manifest = $self->{'manifest'};

    # load all the plugins
    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    my $plugin_incr_mode = $incremental_mode;
    if ($manifest ne "") {
        # if we have a manifest file, then we pretend we are fully incremental for plugins
        $plugin_incr_mode = "all";
    }
    #some global options for the plugins
    my @global_opts = ();

    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory (and tmp directory) if needed
    if ($removeold) {
        if (-e $archivedir) {
            &gsprintf($out, "{import.removing_archives}\n");
            &util::rm_r ($archivedir);
        }
        my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
        $tmpdir =~ s/[\\\/]+/\//g;
        $tmpdir =~ s/\/$//;
        if (-e $tmpdir) {
            &gsprintf($out, "{import.removing_tmpdir}\n");
            &util::rm_r ($tmpdir);
        }
    }

    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file

    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);

    my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
    $archive_info->load_info ($arcinfo_doc_filename);

    if ($manifest eq "") {
        # Load in list of files in import folder from last import (if present)
        $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    }

    ####Use Plugout####
    # Choose the output plugout: explicit collect.cfg setting wins, otherwise
    # it is derived from -saveas (with a mode-appropriate fallback).
    my $plugout;

    if ($inexport_mode eq "import") {
        if (defined $collectcfg->{'plugout'}) {
            # If a plugout was specified in the collect.cfg file, assume it is sensible
            # We can't check the name because it could be anything, if it is a custom plugout
            $plugout = $collectcfg->{'plugout'};
        }
        else{
            if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
                push @$plugout,"GreenstoneXMLPlugout";
            }
            else{
                push @$plugout,$saveas."Plugout";
            }
        }
    }
    else {
        if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
            $plugout = $collectcfg->{'plugout'};
        }
        else{
            if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
                push @$plugout,"GreenstoneMETSPlugout";
            }
            else{
                push @$plugout,$saveas."Plugout";
            }
        }
    }

    my $plugout_name = $plugout->[0];

    # append the plugout's command-line-style options
    push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
    push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
    push @$plugout,("-debug") if ($debug);
    push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
    push @$plugout,("-gzip_output") if ($gzip);
    push @$plugout,("-output_handle",$out) if (defined $out);

    push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");

    # plugout-specific options
    if ($plugout_name =~ m/^MARCXMLPlugout$/) {
        push @$plugout,("-group") if ($group_marc);
        push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "");
    }
    if ($plugout_name =~ m/^.*METSPlugout$/) {
        push @$plugout,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $xslt_mets ne "");
        push @$plugout,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $xslt_txt ne "");
    }

    if ($plugout_name eq "FedoraMETSPlugout") {
        push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "");
    }

    my $processor = &plugout::load_plugout($plugout);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($removeold) {
        # occasionally, plugins may want to do something on remove old, eg pharos image indexing
        &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
    }

    # process the import directory
    my $block_hash = {};
    $block_hash->{'new_files'} = {};
    $block_hash->{'reindex_files'} = {};
    my $metadata = {};

    # gobal blocking pass may set up some metadata
    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);

    if ($manifest ne "") {
        # --- manifest-driven run: the manifest tells us exactly what to
        #     delete, reindex and index ---
        #
        # 1. Process delete files first
        #
        my @deleted_files = keys %{$manifest_lookup->{'delete'}};
        my @full_deleted_files = ();

        # ensure all filenames are absolute
        foreach my $df (@deleted_files) {
            my $full_df =
                (&util::filename_is_absolute($df))
                ? $df
                : &util::filename_cat($importdir,$df);

            # a directory entry expands to all files beneath it
            if (-d $full_df) {
                &add_dir_contents_to_list($full_df, \@full_deleted_files);
            } else {
                push(@full_deleted_files,$full_df);
            }
        }

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
        mark_docs_for_deletion($archive_info,{},
                               \@full_deleted_files,
                               $archivedir, $verbosity, "delete");

        #
        # 2. Now files for reindexing
        #

        my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
        my @full_reindex_files = ();
        # ensure all filenames are absolute
        foreach my $rf (@reindex_files) {
            my $full_rf =
                (&util::filename_is_absolute($rf))
                ? $rf
                : &util::filename_cat($importdir,$rf);

            if (-d $full_rf) {
                &add_dir_contents_to_list($full_rf, \@full_reindex_files);
            } else {
                push(@full_reindex_files,$full_rf);
            }
        }

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
        mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");

        # And now to ensure the new version of the file processed by
        # appropriate plugin, we need to add it to block_hash reindex list
        foreach my $full_rf (@full_reindex_files) {
            $block_hash->{'reindex_files'}->{$full_rf} = 1;
        }

        #
        # 3. Now finally any new files - add to block_hash new_files list
        #

        my @new_files = keys %{$manifest_lookup->{'index'}};
        my @full_new_files = ();

        foreach my $nf (@new_files) {
            # ensure filename is absolute
            my $full_nf =
                (&util::filename_is_absolute($nf))
                ? $nf
                : &util::filename_cat($importdir,$nf);

            if (-d $full_nf) {
                &add_dir_contents_to_list($full_nf, \@full_new_files);
            } else {
                push(@full_new_files,$full_nf);
            }
        }

        # skip any "new" file that the src archive db already knows about
        my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
        my $arcinfodb_map = {};
        &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
        foreach my $f (@full_new_files) {
            # check that we haven't seen it already
            if (defined $arcinfodb_map->{$f}) {
                # TODO make better warning
                print STDERR "Warning: $f already in src archive, \n";
            } else {
                $block_hash->{'new_files'}->{$f} = 1;
            }
        }

        undef $arcinfodb_map;
    }
    else {
        # if incremental, we read through the import folder to see whats changed.

        if ($incremental || $incremental_mode eq "onlyadd") {
            prime_doc_oid_count($archivedir);

            # Can now work out which files were new, already existed, and have
            # been deleted

            new_vs_old_import_diff($archive_info,$block_hash,$importdir,
                                   $archivedir,$verbosity,$incremental_mode);

            my @new_files = sort keys %{$block_hash->{'new_files'}};
            # NOTE(review): parens look misplaced (scalar(@new_files>0)) but the
            # expression still evaluates to "array is non-empty" as intended
            if (scalar(@new_files>0)) {
                print STDERR "New files and modified metadata files since last import:\n  ";
                print STDERR join("\n  ",@new_files), "\n";
            }

            if ($incremental) {
                # only look for deletions if we are truely incremental
                my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
                # Filter out any in gsdl/tmp area
                my @filtered_deleted_files = ();
                my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
                my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
                $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
                $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);

                foreach my $df (@deleted_files) {
                    next if ($df =~ m/^$gsdl_tmp_area/);
                    next if ($df =~ m/^$collect_tmp_area/);

                    push(@filtered_deleted_files,$df);
                }

                @deleted_files = @filtered_deleted_files;

                if (scalar(@deleted_files)>0) {
                    print STDERR "Files deleted since last import:\n  ";
                    print STDERR join("\n  ",@deleted_files), "\n";

                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);

                    mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
                }

                my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};

                if (scalar(@reindex_files)>0) {
                    print STDERR "Files to reindex since last import:\n  ";
                    print STDERR join("\n  ",@reindex_files), "\n";
                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
                    mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
                }

            }
        }
    }

    # now, whichever mode we are in, we can process the entire import folder
    if ((defined $jobs) && ($jobs > 1))
    {
        # if jobs are set to >1, run in parallel using MPI helper
        # [hs, 1 july 2010]
        &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,
                                              $self->{'collection'}, $self->{'site'});
    }
    else
    {
        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }

    if ($saveas eq "FedoraMETS") {
        # create collection "doc obj" for Fedora that contains
        # collection-level metadata

        my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
        $doc_obj->set_OID("collection");

        my $col_name = undef;
        my $col_meta = $collectcfg->{'collectionmeta'};

        if (defined $col_meta) {
            store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
            store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
        }
        $processor->process($doc_obj);
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # Store the value of OIDCount (used in doc.pm) so it can be
    # restored correctly to this value on an incremental build
    store_doc_oid_count($archivedir);

    # write out the archive information file
    $processor->close_file_output() if (defined $groupsize) && ($groupsize > 1);
    $processor->close_group_output() if $processor->is_group();

    # for backwards compatability with archvies.inf file
    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
        $archive_info->save_info($arcinfo_doc_filename);
    }
    else {
        $archive_info->save_revinfo_db($arcinfo_src_filename);
    }

    return $pluginfo;
}
760
761
# Write the end-of-run statistics and close the handles opened earlier.
#
# Arguments:
#   $pluginfo - the plugin list returned by process_files()
#
# Stats go to $self->{'statsfile'} (a file is opened on the package-level
# STATS handle if it isn't STDOUT/STDERR; on open failure we fall back to
# STDERR).  Also closes OUT (if new() redirected output to a file) and the
# FAILLOG handle opened in read_collection_cfg().
sub generate_statistics
{
    my $self = shift @_;
    my ($pluginfo) = @_;

    my $inexport_mode = $self->{'mode'};

    my $statsfile = $self->{'statsfile'};
    my $out = $self->{'out'};
    my $faillogname = $self->{'faillogname'};
    my $gli = $self->{'gli'};
    my $jobs = $self->{'jobs'};

    # write out import stats

    if ((!defined $jobs) || ($jobs == 1))
    {
        # only output statistics for a single-job run; parallel (jobs > 1)
        # runs skip this block  [hs, 1 july 2010]
        # NOTE(review): the original comment here said the opposite
        # ("if there are multiple jobs"), which contradicted the condition

        my $close_stats = 0;
        if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
            if (open (STATS, ">$statsfile")) {
                # store the handle name as a string (works via "no strict 'refs'")
                $statsfile = 'inexport::STATS';
                $close_stats = 1;
            } else {
                &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
                &gsprintf($out, "{import.stats_backup}\n");
                $statsfile = 'STDERR';
            }
        }

        &gsprintf($out, "\n");
        &gsprintf($out, "*********************************************\n");
        &gsprintf($out, "{$inexport_mode.complete}\n");
        &gsprintf($out, "*********************************************\n");

        &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
        if ($close_stats) {
            close STATS;
        }
    }

    close OUT if $self->{'close_out'};
    close FAILLOG;
}
808
809
# Copy one field of collection-level metadata (e.g. "collectionname") from
# the collect.cfg structure onto the given doc object as extracted ("ex.")
# metadata.  Keys of the form "[l=xx]" carry a language qualifier, which is
# appended as "ex.<field>^xx".  The English or unqualified collection name
# is additionally recorded as dc.Title.
sub store_collectionmeta
{
    my ($collectionmeta, $field, $doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $values_by_key = $collectionmeta->{$field};

    while (my ($meta_key, $val) = each %$values_by_key) {
        my $md_label = "ex.$field";

        # "[l=en]"-style keys identify a language-specific value
        if ($meta_key =~ m/^\[l=(.*?)\]$/) {
            $md_label .= "^$1";
        }

        $doc_obj->add_utf8_metadata($top_section, $md_label, $val);

        # see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
        # while "collectionname" in GS2 is called "name" in GS3.
        # Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
        if (($md_label eq "ex.collectionname^en") || ($md_label eq "ex.collectionname")) {
            $doc_obj->add_utf8_metadata($top_section, "dc.Title", $val);
        }
    }
}
847
848
# Path of the file in the archives directory that persists the document
# OID counter between imports.
sub oid_count_file {
    my $archivedir = shift @_;

    return &util::filename_cat($archivedir, "OIDcount");
}
853
854
# Restore doc.pm's document OID counter from the "OIDcount" file left in
# the archives directory by a previous import, so that an incremental
# build continues numbering where the last one stopped.
#
# Arguments:
#   $archivedir - the collection's archives directory
#
# If the file is absent (first/full import) this is a no-op.  If it exists
# but cannot be opened, a warning is printed and the counter keeps its
# current value.
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # nothing persisted yet - leave the counter alone
    return unless -e $oid_count_filename;

    # lexical filehandle + three-arg open (the old code used a 2-arg open
    # onto a bareword global handle)
    if (open(my $oid_in, '<', $oid_count_filename)) {
        my $OIDcount = <$oid_in>;
        close($oid_in);
        # guard against an empty file: chomp on undef would warn
        chomp $OIDcount if defined $OIDcount;

        $doc::OIDcount = $OIDcount;
    }
    else {
        print STDERR "Warning: unable to read document OID count from $oid_count_filename\n";
        print STDERR "Setting value to 0\n";
    }

    return;
}
876
# Use the file "OIDcount" in the archives directory to record
# what value doc.pm got up to, so prime_doc_oid_count() can restore it
# on the next incremental import.
#
# Arguments:
#   $archivedir - the collection's archives directory
#
# Prints a warning (but does not die) if the file cannot be written.
sub store_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # lexical filehandle + three-arg open (the old code used a 2-arg open
    # onto a bareword global handle)
    if (open(my $oid_out, '>', $oid_count_filename)) {
        print $oid_out $doc::OIDcount, "\n";

        # buffered write errors only surface at close, so check it
        close($oid_out)
            or print STDERR "Warning: error closing $oid_count_filename: $!\n";
    }
    else {
        print STDERR "Warning: unable to store document OID count\n";
    }

    return;
}
895
896
897
898sub new_vs_old_import_diff
899{
900 my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;
901
902 # Get the infodbtype value for this collection from the arcinfo object
903 my $infodbtype = $archive_info->{'infodbtype'};
904
905 # in this method, we want to know if metadata files are modified or not.
906 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
907
908 my $archiveinf_timestamp = -M $arcinfo_doc_filename;
909
910 # First convert all files to absolute form
911 # This is to support the situation where the import folder is not
912 # the default
913
914 my $prev_all_files = $archive_info->{'prev_import_filelist'};
915 my $full_prev_all_files = {};
916
917 foreach my $prev_file (keys %$prev_all_files) {
918
919 if (!&util::filename_is_absolute($prev_file)) {
920 my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
921 $full_prev_all_files->{$full_prev_file} = $prev_file;
922 }
923 else {
924 $full_prev_all_files->{$prev_file} = $prev_file;
925 }
926 }
927
928
929 # Figure out which are the new files, existing files and so
930 # by implication the files from the previous import that are not
931 # there any more => mark them for deletion
932 foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
933
934 my $full_curr_file = $curr_file;
935
936 # entry in 'all_files' is moved to either 'existing_files',
937 # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'
938
939 if (!&util::filename_is_absolute($curr_file)) {
940 # add in import dir to make absolute
941 $full_curr_file = &util::filename_cat($importdir,$curr_file);
942 }
943
944 # figure out if new file or not
945 if (defined $full_prev_all_files->{$full_curr_file}) {
946 # delete it so that only files that need deleting are left
947 delete $full_prev_all_files->{$full_curr_file};
948
949 # had it before. is it a metadata file?
950 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
951
952 # is it modified??
953 if (-M $full_curr_file < $archiveinf_timestamp) {
954 print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
955 # its newer than last build
956 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
957 }
958 }
959 else {
960 if ($incremental_mode eq "all") {
961
962 # had it before
963 $block_hash->{'existing_files'}->{$full_curr_file} = 1;
964
965 }
966 else {
967 # Warning in "onlyadd" mode, but had it before!
968 print STDERR "Warning: File $full_curr_file previously imported.\n";
969 print STDERR " Treating as new file\n";
970
971 $block_hash->{'new_files'}->{$full_curr_file} = 1;
972
973 }
974 }
975 }
976 else {
977 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
978 # the new file is the special sort of file greenstone uses
979 # to attach metadata to src documents
980 # i.e metadata.xml
981 # (but note, the filename used is not constrained in
982 # Greenstone to always be this)
983
984 print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
985 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
986 }
987 else {
988 $block_hash->{'new_files'}->{$full_curr_file} = 1;
989 }
990 }
991
992
993 delete $block_hash->{'all_files'}->{$curr_file};
994 }
995
996
997
998
999 # Deal with complication of new or modified metadata files by forcing
1000 # everything from this point down in the file hierarchy to
1001 # be freshly imported.
1002 #
1003 # This may mean files that have not changed are reindexed, but does
1004 # guarantee by the end of processing all new metadata is correctly
1005 # associated with the relevant document(s).
1006
1007 foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
1008 my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");
1009
1010 $situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
1011 $situated_dir =~ s/\\/\\\\/g; # need to protect windows slash \ in regular expression
1012
1013 # Go through existing_files, and mark anything that is contained
1014 # within 'situated_dir' to be reindexed (in case some of the metadata
1015 # attaches to one of these files)
1016
1017 my $reindex_files = [];
1018
1019 foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {
1020
1021 if ($existing_f =~ m/^$situated_dir/) {
1022
1023 print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";
1024
1025 push(@$reindex_files,$existing_f);
1026 $block_hash->{'reindex_files'}->{$existing_f} = 1;
1027 delete $block_hash->{'existing_files'}->{$existing_f};
1028
1029 }
1030 }
1031
1032 # metadata file needs to be in new_files list so parsed by MetadataXMLPlug
1033 # (or equivalent)
1034 $block_hash->{'new_files'}->{$new_mdf} = 1;
1035
1036 }
1037
1038 # go through remaining existing files and work out what has changed and needs to be reindexed.
1039 my @existing_files = sort keys %{$block_hash->{'existing_files'}};
1040
1041 my $reindex_files = [];
1042
1043 foreach my $existing_filename (@existing_files) {
1044 if (-M $existing_filename < $archiveinf_timestamp) {
1045 # file is newer than last build
1046
1047 my $existing_file = $existing_filename;
1048 #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});
1049
1050 #my $collectdir_resafe = &util::filename_to_regex($collectdir);
1051 #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;
1052
1053 print STDERR "**** Reindexing existing file: $existing_file\n";
1054
1055 push(@$reindex_files,$existing_file);
1056 $block_hash->{'reindex_files'}->{$existing_filename} = 1;
1057 }
1058
1059 }
1060
1061
1062 # By this point full_prev_all_files contains the files
1063 # mentioned in archiveinf-src.db but are not in the 'import'
1064 # folder (or whatever was specified through -importdir ...)
1065
1066 # This list can contain files that were created in the 'tmp' or
1067 # 'cache' areas (such as screen-size and thumbnail images).
1068 #
1069 # In building the final list of files to delete, we test to see if
1070 # it exists on the filesystem and if it does (unusual for a "normal"
1071 # file in import, but possible in the case of 'tmp' files),
1072 # supress it from going into the final list
1073
1074 my $collectdir = $ENV{'GSDLCOLLECTDIR'};
1075
1076 my @deleted_files = values %$full_prev_all_files;
1077 map { my $curr_file = $_;
1078 my $full_curr_file = $curr_file;
1079
1080 if (!&util::filename_is_absolute($curr_file)) {
1081 # add in import dir to make absolute
1082
1083 $full_curr_file = &util::filename_cat($collectdir,$curr_file);
1084 }
1085
1086
1087 if (!-e $full_curr_file) {
1088 $block_hash->{'deleted_files'}->{$curr_file} = 1;
1089 }
1090 } @deleted_files;
1091
1092
1093
1094}
1095
1096
# Marks documents as "D" in the archiveinf-doc database so the next
# buildcol.pl either drops them from the index or rebuilds them.  Used both
# to delete "deleted" docs and to remove old versions of "changed" docs.
#
# Parameters:
#   $archive_info  - arcinfo object; supplies 'infodbtype' and per-OID
#                    index-status get/set
#   $block_hash    - file-classification hash; this sub adds source docs to
#                    $block_hash->{'reindex_files'} (and prunes that list
#                    again in 'delete' mode)
#   $deleted_files - ref to array of file paths that have gone away (or changed)
#   $archivedir    - collection 'archives' dir holding the archiveinf-* databases
#   $verbosity     - per-OID progress is printed to STDERR when > 1
#   $mode          - 'delete' or 'reindex'
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    # human-readable action for the progress message only
    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
	$mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # archiveinf-doc maps OID -> doc record; archiveinf-src maps file -> OIDs
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
	# use 'archiveinf-src' info database file to look up all the OIDs
	# that this file is used in (note in most cases, it's just one OID)

	my $src_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
	my $oids = $src_rec->{'oid'};
	my $file_record_deleted = 0;

	# delete the src record
	# NOTE(review): handle is opened in "append" mode per db entry delete;
	# presumably required by the dbutil backend — confirm before changing
	my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
	&dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
	&dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);


	foreach my $oid (@$oids) {

	    # find the source doc (the primary file that becomes this oid)
	    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
	    my $doc_source_file = $doc_rec->{'src-file'}->[0];
	    if (!&util::filename_is_absolute($doc_source_file)) {
		$doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
	    }

	    if ($doc_source_file ne $file) {
		# its an associated or metadata file

		# mark source doc for reimport as one of its assoc files has changed or deleted
		$block_hash->{'reindex_files'}->{$doc_source_file} = 1;

	    }
	    # only mark the OID once: skip docs already flagged "D"
	    my $curr_status = $archive_info->get_status_info($oid);
	    if (defined($curr_status) && (($curr_status ne "D"))) {
		if ($verbosity>1) {
		    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
		}
		# mark oid for deletion (it will be deleted or reimported)
		$archive_info->set_status_info($oid,"D");
		# patch the raw record text so the persisted <index-status>
		# matches the in-memory status, then rewrite the db entry
		my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
		$val =~ s/^<index-status>(.*)$/<index-status>D/m;

		my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
		my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

		&dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
		&dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
	    }
	}

    }
    # now go through and check that we haven't marked any primary files for reindex (because their associated files have changed/deleted) when they have been deleted themselves. only in delete mode.
    if ($mode eq "delete") {
	foreach my $file (@$deleted_files) {
	    if (defined $block_hash->{'reindex_files'}->{$file}) {
		delete $block_hash->{'reindex_files'}->{$file};
	    }
	}
    }


}
1176
# Recursively collects every plain file under $dirname, appending each full
# path to the array referenced by $list.  The '.' and '..' entries and any
# '.svn' bookkeeping directories are skipped.
#
# Parameters:
#   $dirname - directory to scan
#   $list    - ref to array that receives the file paths found
#
# Returns -1 (after printing a warning to STDERR) if $dirname cannot be
# opened; entries gathered before the failure remain in @$list.  Returns 0
# on success.
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # find all the entries in the directory; a lexical dir handle (rather
    # than the old bareword DIR) keeps recursion from sharing global state
    if (!opendir(my $dir_handle, $dirname)) {
	print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    my @dir_entries = readdir($dir_handle);
    closedir($dir_handle);

    foreach my $subfile (@dir_entries) {
	next if ($subfile =~ m/^\.\.?$/);  # skip '.' and '..'
	next if ($subfile =~ /^\.svn$/);   # skip subversion metadata dirs
	my $full_file = &util::filename_cat($dirname, $subfile);
	if (-d $full_file) {
	    # recurse into subdirectory
	    add_dir_contents_to_list($full_file, $list);
	} else {
	    push (@$list, $full_file);
	}
    }

    return 0;
}
1205
1206
1; # module must end with a true value so 'use inexport' succeeds
Note: See TracBrowser for help on using the repository browser.