source: main/trunk/greenstone2/perllib/inexport.pm@ 23825

Last change on this file since 23825 was 23825, checked in by sjm84, 13 years ago

Phase two of committing the files changed to extend the DSpace exporting capabilities to include more than just Dublin Core metadata

  • Property svn:executable set to *
File size: 37.2 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use plugin;
38use plugout;
39use manifest;
40use inexport;
41use util;
42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
47
48use File::Basename;
49
# Constructor used by the import.pl / export.pl command-line scripts.
#
# Parameters:
#   $mode                - "import" or "export"; stored for later dispatch
#   $argv                - ref to the remaining command-line arguments
#   $options             - parsed option specification (hash with 'args' key)
#   $opt_listall_options - option spec used when -listall is requested
#
# Returns a blessed inexport object, or dies (after printing usage) on
# bad arguments.  Note: several code paths deliberately die "\n" after
# printing usage, so the caller sees no extra error text.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
	&gsprintf::load_language_specific_resource_bundle($language);
    }

    # -listall: print every available option (xml or text form) and exit
    if ($self->{'listall'}) {
	if ($self->{'xml'}) {
	    &PrintUsage::print_xml_usage($opt_listall_options);
	}
	else
	{
	    &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
	}
	die "\n";
    }

    # -xml: emit machine-readable usage; still return a (mostly empty) object
    if ($self->{'xml'}) {
	&PrintUsage::print_xml_usage($options);
	print "\n";
	return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
	&gsprintf::output_strings_in_UTF8;
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also

    if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
    {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }

    # Redirect output unless -out is STDERR/STDOUT.  The handle is a
    # package-level bareword (inexport::OUT); generate_statistics() later
    # closes it when 'close_out' is set.
    # NOTE(review): 2-arg open interpolates $out as the mode+name — a
    # filename beginning with '>' or '|' would change semantics; a 3-arg
    # open would be safer, but the 'inexport::OUT' string-handle idiom is
    # relied on elsewhere, so left unchanged here.
    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
	open (OUT, ">$out") ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
	$out = 'inexport::OUT';
	$self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # Parallel processing support is only loaded when actually requested
    if ((defined $self->{'jobs'}) && ($self->{'jobs'}>1)) {
	require ParallelInexport;
    }

    return bless $self, $class;
}
127
# Simplified version of the constructor for use with CGI scripts.
# No command-line parsing is done: output always goes to STDERR, and the
# site / collect directory are taken from the optional $gsdl_cgi helper
# (GS3) or derived from $ENV{'GSDLHOME'} (GS2 stand-alone).
sub newCGI
{
    my $class = shift (@_);
    my ($mode, $collect, $gsdl_cgi, $opt_site) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    $self->{'out'} = STDERR;

    if (defined $gsdl_cgi) {
	# GS3-style: the CGI helper knows where this site's collections live
	$self->{'site'}       = $opt_site;
	$self->{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }
    else {
	# GS2-style: collections live directly under $GSDLHOME/collect
	$self->{'site'}       = "";
	$self->{'collectdir'} = &util::filename_cat($ENV{'GSDLHOME'}, "collect");
    }

    $self->{'faillog'}    = "";
    $self->{'collection'} = $collect;

    return bless $self, $class;
}
# Accessor: returns the name of the collection this object operates on.
sub get_collection
{
    my ($self) = @_;

    return $self->{'collection'};
}
159
160
# Locate and parse the collection's configuration file.
#
# Parameters:
#   $collection - collection name (validated/expanded via colcfg)
#   $options    - option spec, used only to print usage on failure
#
# Side effects: prepends the collection's perllib to @INC, and opens the
# fail log as the package-level bareword handle inexport::FAILLOG (closed
# later in generate_statistics).
#
# Returns ($config_filename, $collectcfg) — the config file path and the
# parsed configuration hash.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    # use_collection also sets $ENV{'GSDLCOLLECTDIR'}; an empty return
    # means the collection could not be found
    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
	$faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # NOTE(review): 2-arg open with interpolated filename; the
    # 'inexport::FAILLOG' string-handle idiom below depends on the
    # bareword handle, so left as-is
    open (FAILLOG, ">$faillog") ||
	(&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);

    # keep the real path for reporting; pass the handle around as a
    # fully-qualified string so other packages can print to it
    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;

    # Read in the collection configuration file.
    my ($config_filename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
200
# Merge command-line options with values from the collection's collect.cfg,
# applying defaults where neither supplied a value, then store the final
# settings back on $self.
#
# Precedence throughout: command-line value (already on $self) wins over
# collect.cfg, which wins over the hard-coded default.  The statement
# order below is significant — do not reorder.
#
# Parameter: $collectcfg — parsed collection configuration hash.
# Returns nothing useful; mutates $self (and $collectcfg->{'infodbtype'}).
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};
    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'} || "";
    my $out = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
	$collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
	# we can't use the text version for archives dbs.
	$collectcfg->{'infodbtype'} = "gdbm";
    }
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
	$importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
	$archivedir = $collectcfg->{'archivedir'};
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    if (!-e $importdir) {
	&gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
	die "\n";
    }
    $self->{'importdir'} = $importdir;

    # Default output directory depends on whether we are importing
    # (archives/) or exporting (export/)
    if ($archivedir eq "") {
	if ($inexport_mode eq "import") {
	    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
	}
	elsif ($inexport_mode eq "export") {
	    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
	}
	else {
	    print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
	    print STDERR "         Defaulting to 'archives' for file output\n";
	    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
	}
    }

    # normalise slashes the same way as importdir above
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $self->{'archivedir'} = $archivedir;

    if ($verbosity !~ /\d+/) {
	if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
	    $verbosity = $collectcfg->{'verbosity'};
	} else {
	    $verbosity = 2; # the default
	}
    }
    $self->{'verbosity'} = $verbosity;

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
	$self->{'manifest'} = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
	if ($collectcfg->{'gzip'} =~ /^true$/i) {
	    $self->{'gzip'} = 1;
	}
    }

    if ($self->{'maxdocs'} !~ /\-?\d+/) {
	if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
	    $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
	} else {
	    $self->{'maxdocs'} = -1; # the default (no limit)
	}
    }

    # groupsize == 1 is the command-line default, so only then may
    # collect.cfg override it
    if ((defined $self->{'groupsize'}) && ($self->{'groupsize'} == 1)) {
	if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
	    $self->{'groupsize'} = $collectcfg->{'groupsize'};
	}
    }

    if (!defined $self->{'OIDtype'}
	|| ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) {
	if (defined $collectcfg->{'OIDtype'}
	    && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
	    $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
	} else {
	    $self->{'OIDtype'} = "hash"; # the default
	}
    }

    if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
	if (defined $collectcfg->{'OIDmetadata'}) {
	    $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
	} else {
	    $self->{'OIDmetadata'} = "dc.Identifier"; # the default
	}
    }

    my $sortmeta = $self->{'sortmeta'};
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
	$sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $self->{'groupsize'} > 1) {
	&gsprintf($out, "{import.cannot_sort}\n\n");
	$sortmeta = undef;
    }
    $self->{'sortmeta'} = $sortmeta;

    if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") {
	$self->{'removeprefix'} = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") {
	$self->{'removesuffix'} = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
	$self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
	$self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
	= &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
						   $self->{'incremental'}, $checkdir,
						   $collectcfg);

    $self->{'removeold'} = $removeold;
    $self->{'keepold'} = $keepold;
    $self->{'incremental'} = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;
}
349
# The main import/export pipeline.  Loads plugins and a plugout, cleans or
# primes the archives directory, works out which source files are new /
# modified / deleted (via a manifest file or an incremental diff), then
# drives every plugin over the import directory.
#
# Parameters:
#   $config_filename - path of the collection config file (passed to doc)
#   $collectcfg      - parsed collection configuration hash
#
# Returns $pluginfo (the loaded plugin list) so the caller can hand it to
# generate_statistics().
sub process_files
{
    my $self = shift @_;
    my ($config_filename,$collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    # Pull every option resolved by set_collection_options() into locals
    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};

    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};

    my $incremental = $self->{'incremental'};
    my $incremental_mode = $self->{'incremental_mode'};

    my $removeold = $self->{'removeold'};
    my $keepold = $self->{'keepold'};

    my $saveas = $self->{'saveas'};
    my $OIDtype = $self->{'OIDtype'};
    my $OIDmetadata = $self->{'OIDmetadata'};

    my $out = $self->{'out'};
    my $faillog = $self->{'faillog'};

    my $maxdocs = $self->{'maxdocs'};
    my $gzip = $self->{'gzip'};
    my $groupsize = $self->{'groupsize'};
    my $sortmeta = $self->{'sortmeta'};

    my $removeprefix = $self->{'removeprefix'};
    my $removesuffix = $self->{'removesuffix'};

    my $gli = $self->{'gli'};

    my $jobs = $self->{'jobs'};
    my $epoch = $self->{'epoch'};

    # related to export
    my $xsltfile = $self->{'xsltfile'};
    my $group_marc = $self->{'group_marc'};
    my $mapping_file = $self->{'mapping_file'};
    my $xslt_mets = $self->{'xslt_mets'};
    my $xslt_txt = $self->{'xslt_txt'};
    my $fedora_namespace = $self->{'fedora_namespace'};
    my $metadata_prefix = $self->{'metadata_prefix'};

    # GLI parses these marker tags from stderr to track progress
    if ($inexport_mode eq "import") {
	print STDERR "<Import>\n" if $gli;
    }
    else {
	print STDERR "<export>\n" if $gli;
    }

    # If a manifest file was given, parse it: it explicitly lists the files
    # to index / reindex / delete, bypassing the directory diff below
    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
    if ($self->{'manifest'} ne "") {
	my $manifest_filename = $self->{'manifest'};

	if (!&util::filename_is_absolute($manifest_filename)) {
	    $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
	}

	$self->{'manifest'} =~ s/[\\\/]+/\//g;
	$self->{'manifest'} =~ s/\/$//;

	$manifest_lookup->parse($manifest_filename);
    }

    my $manifest = $self->{'manifest'};

    # load all the plugins
    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
	$plugins = $collectcfg->{'plugin'};
    }

    my $plugin_incr_mode = $incremental_mode;
    if ($manifest ne "") {
	# if we have a manifest file, then we pretend we are fully incremental for plugins
	$plugin_incr_mode = "all";
    }
    #some global options for the plugins
    my @global_opts = ();

    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode);
    if (scalar(@$pluginfo) == 0) {
	&gsprintf($out, "{import.no_plugins_loaded}\n");
	die "\n";
    }

    # remove the old contents of the archives directory (and tmp directory) if needed
    if ($removeold) {
	if (-e $archivedir) {
	    &gsprintf($out, "{import.removing_archives}\n");
	    &util::rm_r ($archivedir);
	}
	my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
	$tmpdir =~ s/[\\\/]+/\//g;
	$tmpdir =~ s/\/$//;
	if (-e $tmpdir) {
	    &gsprintf($out, "{import.removing_tmpdir}\n");
	    &util::rm_r ($tmpdir);
	}
    }

    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file

    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);

    my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
    $archive_info->load_info ($arcinfo_doc_filename);

    if ($manifest eq "") {
	# Load in list of files in import folder from last import (if present)
	$archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    }

    ####Use Plugout####
    # Choose the output plugin ("plugout"): from collect.cfg when valid for
    # this mode, otherwise from -saveas, otherwise a mode-specific default.
    my $plugout;

    if ($inexport_mode eq "import") {
	if (defined $collectcfg->{'plugout'}) {
	    # If a plugout was specified in the collect.cfg file, assume it is sensible
	    # We can't check the name because it could be anything, if it is a custom plugout
	    $plugout = $collectcfg->{'plugout'};
	}
	else{
	    if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
		push @$plugout,"GreenstoneXMLPlugout";
	    }
	    else{
		push @$plugout,$saveas."Plugout";
	    }
	}
    }
    else {
	if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
	    $plugout = $collectcfg->{'plugout'};
	}
	else{
	    if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
		push @$plugout,"GreenstoneMETSPlugout";
	    }
	    else{
		push @$plugout,$saveas."Plugout";
	    }
	}
    }

    my $plugout_name = $plugout->[0];

    # Append common plugout options...
    push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
    push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
    push @$plugout,("-debug") if ($debug);
    push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
    push @$plugout,("-gzip_output") if ($gzip);
    push @$plugout,("-output_handle",$out) if (defined $out);

    push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");

    # ...then plugout-specific options
    if ($plugout_name =~ m/^MARCXMLPlugout$/) {
	push @$plugout,("-group") if ($group_marc);
	push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "");
    }
    if ($plugout_name =~ m/^.*METSPlugout$/) {
	push @$plugout,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $xslt_mets ne "");
	push @$plugout,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $xslt_txt ne "");
    }

    if ($plugout_name eq "FedoraMETSPlugout") {
	push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "");
    }

    if ($plugout_name eq "DSpacePlugout") {
	push @$plugout,("-metadata_prefix",$metadata_prefix) if (defined $metadata_prefix && $metadata_prefix ne "");
    }

    my $processor = &plugout::load_plugout($plugout);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($removeold) {
	# occasionally, plugins may want to do something on remove old, eg pharos image indexing
	&plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
    }

    # process the import directory
    my $block_hash = {};
    $block_hash->{'new_files'} = {};
    $block_hash->{'reindex_files'} = {};
    my $metadata = {};

    # gobal blocking pass may set up some metadata
    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);

    if ($manifest ne "") {
	# Manifest-driven mode: the manifest tells us exactly what to do
	#
	# 1. Process delete files first
	#
	my @deleted_files = keys %{$manifest_lookup->{'delete'}};
	my @full_deleted_files = ();

	# ensure all filenames are absolute
	foreach my $df (@deleted_files) {
	    my $full_df =
		(&util::filename_is_absolute($df))
		? $df
		: &util::filename_cat($importdir,$df);

	    # a directory entry stands for every file underneath it
	    if (-d $full_df) {
		&add_dir_contents_to_list($full_df, \@full_deleted_files);
	    } else {
		push(@full_deleted_files,$full_df);
	    }
	}

	&plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
	mark_docs_for_deletion($archive_info,{},
			       \@full_deleted_files,
			       $archivedir, $verbosity, "delete");

	#
	# 2. Now files for reindexing
	#

	my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
	my @full_reindex_files = ();
	# ensure all filenames are absolute
	foreach my $rf (@reindex_files) {
	    my $full_rf =
		(&util::filename_is_absolute($rf))
		? $rf
		: &util::filename_cat($importdir,$rf);

	    if (-d $full_rf) {
		&add_dir_contents_to_list($full_rf, \@full_reindex_files);
	    } else {
		push(@full_reindex_files,$full_rf);
	    }
	}

	&plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
	mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");

	# And now to ensure the new version of the file processed by
	# appropriate plugin, we need to add it to block_hash reindex list
	foreach my $full_rf (@full_reindex_files) {
	    $block_hash->{'reindex_files'}->{$full_rf} = 1;
	}

	#
	# 3. Now finally any new files - add to block_hash new_files list
	#

	my @new_files = keys %{$manifest_lookup->{'index'}};
	my @full_new_files = ();

	foreach my $nf (@new_files) {
	    # ensure filename is absolute
	    my $full_nf =
		(&util::filename_is_absolute($nf))
		? $nf
		: &util::filename_cat($importdir,$nf);

	    if (-d $full_nf) {
		&add_dir_contents_to_list($full_nf, \@full_new_files);
	    } else {
		push(@full_new_files,$full_nf);
	    }
	}

	# skip "new" files that the src archive already knows about
	my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
	my $arcinfodb_map = {};
	&dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
	foreach my $f (@full_new_files) {
	    # check that we haven't seen it already
	    if (defined $arcinfodb_map->{$f}) {
		# TODO make better warning
		print STDERR "Warning: $f already in src archive, \n";
	    } else {
		$block_hash->{'new_files'}->{$f} = 1;
	    }
	}

	undef $arcinfodb_map;
    }
    else {
	# if incremental, we read through the import folder to see whats changed.

	if ($incremental || $incremental_mode eq "onlyadd") {
	    prime_doc_oid_count($archivedir);

	    # Can now work out which files were new, already existed, and have
	    # been deleted

	    new_vs_old_import_diff($archive_info,$block_hash,$importdir,
				   $archivedir,$verbosity,$incremental_mode);

	    my @new_files = sort keys %{$block_hash->{'new_files'}};
	    # NOTE(review): parens are misplaced — should read
	    # scalar(@new_files)>0 — but @new_files>0 evaluates the array in
	    # scalar (count) context first, so the result is the same
	    if (scalar(@new_files>0)) {
		print STDERR "New files and modified metadata files since last import:\n  ";
		print STDERR join("\n  ",@new_files), "\n";
	    }

	    if ($incremental) {
		# only look for deletions if we are truely incremental
		my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
		# Filter out any in gsdl/tmp area
		my @filtered_deleted_files = ();
		my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
		my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
		$gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
		$collect_tmp_area = &util::filename_to_regex($collect_tmp_area);

		foreach my $df (@deleted_files) {
		    next if ($df =~ m/^$gsdl_tmp_area/);
		    next if ($df =~ m/^$collect_tmp_area/);

		    push(@filtered_deleted_files,$df);
		}

		@deleted_files = @filtered_deleted_files;

		if (scalar(@deleted_files)>0) {
		    print STDERR "Files deleted since last import:\n  ";
		    print STDERR join("\n  ",@deleted_files), "\n";

		    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);

		    mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
		}

		my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};

		if (scalar(@reindex_files)>0) {
		    print STDERR "Files to reindex since last import:\n  ";
		    print STDERR join("\n  ",@reindex_files), "\n";
		    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
		    mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
		}

	    }
	}
    }

    # now, whichever mode we are in, we can process the entire import folder
    if ((defined $jobs) && ($jobs > 1))
    {
	# if jobs are set to >1, run in parallel using MPI helper
	# [hs, 1 july 2010]
	&ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,
					      $self->{'collection'}, $self->{'site'});
    }
    else
    {
	&plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }

    if ($saveas eq "FedoraMETS") {
	# create collection "doc obj" for Fedora that contains
	# collection-level metadata

	my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
	$doc_obj->set_OID("collection");

	my $col_name = undef;
	my $col_meta = $collectcfg->{'collectionmeta'};

	if (defined $col_meta) {
	    store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
	    store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
	}
	$processor->process($doc_obj);
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # Store the value of OIDCount (used in doc.pm) so it can be
    # restored correctly to this value on an incremental build
    store_doc_oid_count($archivedir);

    # write out the archive information file
    $processor->close_file_output() if (defined $groupsize) && ($groupsize > 1);
    $processor->close_group_output() if $processor->is_group();

    # for backwards compatability with archvies.inf file
    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
	$archive_info->save_info($arcinfo_doc_filename);
    }
    else {
	$archive_info->save_revinfo_db($arcinfo_src_filename);
    }

    return $pluginfo;
}
764
765
# Write the end-of-run statistics (via plugin::write_stats) and close the
# output and fail-log handles opened in new() / read_collection_cfg().
#
# Parameter: $pluginfo — the plugin list returned by process_files().
sub generate_statistics
{
    my $self = shift @_;
    my ($pluginfo) = @_;

    my $inexport_mode = $self->{'mode'};

    my $statsfile = $self->{'statsfile'};
    my $out = $self->{'out'};
    my $faillogname = $self->{'faillogname'};
    my $gli = $self->{'gli'};
    my $jobs = $self->{'jobs'};

    # write out import stats

    if ((!defined $jobs) || ($jobs == 1))
    {
	# only output statistics when running single-job (non-parallel);
	# with multiple MPI jobs the workers report their own stats
	# [hs, 1 july 2010]

	my $close_stats = 0;
	# NOTE(review): 2-arg bareword open; kept because plugin::write_stats
	# expects the 'inexport::STATS' string-handle form
	if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
	    if (open (STATS, ">$statsfile")) {
		$statsfile = 'inexport::STATS';
		$close_stats = 1;
	    } else {
		# fall back to STDERR if the stats file cannot be opened
		&gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
		&gsprintf($out, "{import.stats_backup}\n");
		$statsfile = 'STDERR';
	    }
	}

	&gsprintf($out, "\n");
	&gsprintf($out, "*********************************************\n");
	&gsprintf($out, "{$inexport_mode.complete}\n");
	&gsprintf($out, "*********************************************\n");

	&plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
	if ($close_stats) {
	    close STATS;
	}
    }

    # close the handles opened in new() and read_collection_cfg()
    close OUT if $self->{'close_out'};
    close FAILLOG;
}
812
813
# Copy one collection-level metadata field (e.g. "collectionname") from the
# collect.cfg metadata hash onto the top section of $doc_obj as
# "ex.<field>" metadata.  A language-qualified key of the form [l=xx]
# becomes a "^xx" suffix on the metadata label.
sub store_collectionmeta
{
    my ($collectionmeta, $field, $doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $values_by_lang = $collectionmeta->{$field};

    foreach my $lang_key (keys %$values_by_lang)
    {
	my $value = $values_by_lang->{$lang_key};

	# Base label is the field name in the "ex" namespace; append the
	# language suffix when the key carries one.
	my $label = "ex.$field";
	if ($lang_key =~ m/^\[l=(.*?)\]$/)
	{
	    $label .= "^$1";
	}

	$doc_obj->add_utf8_metadata($top_section, $label, $value);

	# see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
	# while "collectionname" in GS2 is called "name" in GS3.
	# Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
	if (($label eq "ex.collectionname^en") || ($label eq "ex.collectionname"))
	{
	    $doc_obj->add_utf8_metadata($top_section, "dc.Title", $value);
	}
    }
}
851
852
# Path of the file that persists doc.pm's OID counter between builds.
sub oid_count_file {
    my ($archivedir) = @_;

    # The counter lives in a file named "OIDcount" inside the archives dir
    my $count_path = &util::filename_cat($archivedir, "OIDcount");
    return $count_path;
}
857
858
# Restore doc.pm's global OID counter ($doc::OIDcount) from the "OIDcount"
# file written by store_doc_oid_count() at the end of the previous build,
# so an incremental build continues numbering where the last one stopped.
# Does nothing when the file does not exist (first/full build).
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    if (-e $oid_count_filename) {
	# 3-arg open with a lexical handle (was 2-arg bareword open):
	# avoids mode injection via the filename and handle clashes
	if (open(my $oid_in, '<', $oid_count_filename)) {
	    my $OIDcount = <$oid_in>;
	    close($oid_in);

	    if (defined $OIDcount) {
		chomp $OIDcount;
	    }
	    else {
		# empty counter file: treat as a fresh start rather than
		# assigning undef to the global
		$OIDcount = 0;
	    }

	    $doc::OIDcount = $OIDcount;
	}
	else {

	    print STDERR "Warning: unable to read document OID count from $oid_count_filename\n";
	    print STDERR "Setting value to 0\n";
	}
    }

}
880
# Persist doc.pm's global OID counter ($doc::OIDcount) to the "OIDcount"
# file in the archives directory so prime_doc_oid_count() can restore it
# on the next incremental build.  Failure to write is non-fatal (warning
# only), matching the original behaviour.
sub store_doc_oid_count
{
    # Use the file "OIDcount" in the archives directory to record
    # what value doc.pm got up to

    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # 3-arg open with a lexical handle (was 2-arg bareword open): avoids
    # mode injection via the filename and package-level handle clashes
    if (open(my $oid_out, '>', $oid_count_filename)) {
	print {$oid_out} $doc::OIDcount, "\n";

	close($oid_out);
    }
    else {
	print STDERR "Warning: unable to store document OID count\n";
    }
}
899
900
901
902sub new_vs_old_import_diff
903{
904 my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;
905
906 # Get the infodbtype value for this collection from the arcinfo object
907 my $infodbtype = $archive_info->{'infodbtype'};
908
909 # in this method, we want to know if metadata files are modified or not.
910 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
911
912 my $archiveinf_timestamp = -M $arcinfo_doc_filename;
913
914 # First convert all files to absolute form
915 # This is to support the situation where the import folder is not
916 # the default
917
918 my $prev_all_files = $archive_info->{'prev_import_filelist'};
919 my $full_prev_all_files = {};
920
921 foreach my $prev_file (keys %$prev_all_files) {
922
923 if (!&util::filename_is_absolute($prev_file)) {
924 my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
925 $full_prev_all_files->{$full_prev_file} = $prev_file;
926 }
927 else {
928 $full_prev_all_files->{$prev_file} = $prev_file;
929 }
930 }
931
932
933 # Figure out which are the new files, existing files and so
934 # by implication the files from the previous import that are not
935 # there any more => mark them for deletion
    # Classify every path recorded in 'all_files', moving each one into
    # exactly one of: 'existing_files', 'deleted_files', 'new_files' or
    # 'new_or_modified_metadata_files'.
    # NOTE(review): $importdir, $full_prev_all_files, $archiveinf_timestamp,
    # $incremental_mode and $verbosity are set earlier in this sub, above the
    # region shown here.
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

	my $full_curr_file = $curr_file;

	# entry in 'all_files' is moved to either 'existing_files',
	# 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

	if (!&util::filename_is_absolute($curr_file)) {
	    # add in import dir to make absolute
	    $full_curr_file = &util::filename_cat($importdir,$curr_file);
	}

	# figure out if new file or not
	if (defined $full_prev_all_files->{$full_curr_file}) {
	    # delete it so that only files that need deleting are left
	    delete $full_prev_all_files->{$full_curr_file};

	    # had it before. is it a metadata file?
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {

		# is it modified??
		# -M is the file's age in days since last modification;
		# smaller than $archiveinf_timestamp means it changed more
		# recently than the last build
		if (-M $full_curr_file < $archiveinf_timestamp) {
		    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
		    # its newer than last build
		    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
		}
	    }
	    else {
		if ($incremental_mode eq "all") {

		    # had it before
		    $block_hash->{'existing_files'}->{$full_curr_file} = 1;

		}
		else {
		    # Warning in "onlyadd" mode, but had it before!
		    print STDERR "Warning: File $full_curr_file previously imported.\n";
		    print STDERR " Treating as new file\n";

		    $block_hash->{'new_files'}->{$full_curr_file} = 1;

		}
	    }
	}
	else {
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# the new file is the special sort of file greenstone uses
		# to attach metadata to src documents
		# i.e metadata.xml
		# (but note, the filename used is not constrained in
		# Greenstone to always be this)

		print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
		$block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
	    }
	    else {
		$block_hash->{'new_files'}->{$full_curr_file} = 1;
	    }
	}

	# classified now, so remove from the work list
	delete $block_hash->{'all_files'}->{$curr_file};
    }


    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
	# split off the directory the metadata file sits in; the suffix
	# pattern treats everything after the last '.' as the extension
	my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

	$situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
	$situated_dir =~ s/\\/\\\\/g; # need to protect windows slash \ in regular expression

	# Go through existing_files, and mark anything that is contained
	# within 'situated_dir' to be reindexed (in case some of the metadata
	# attaches to one of these files)

	# NOTE(review): this array is pushed to but never read afterwards;
	# the authoritative record is $block_hash->{'reindex_files'}
	my $reindex_files = [];

	foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

	    # prefix match: is the existing file under the metadata file's dir?
	    if ($existing_f =~ m/^$situated_dir/) {

		print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

		push(@$reindex_files,$existing_f);
		$block_hash->{'reindex_files'}->{$existing_f} = 1;
		delete $block_hash->{'existing_files'}->{$existing_f};

	    }
	}

	# metadata file needs to be in new_files list so parsed by MetadataXMLPlug
	# (or equivalent)
	$block_hash->{'new_files'}->{$new_mdf} = 1;

    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    # NOTE(review): like the earlier $reindex_files, this list is built but
    # never consumed; only $block_hash->{'reindex_files'} is used by callers
    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
	if (-M $existing_filename < $archiveinf_timestamp) {
	    # file is newer than last build

	    my $existing_file = $existing_filename;
	    #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});

	    #my $collectdir_resafe = &util::filename_to_regex($collectdir);
	    #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;

	    print STDERR "**** Reindexing existing file: $existing_file\n";

	    push(@$reindex_files,$existing_file);
	    $block_hash->{'reindex_files'}->{$existing_filename} = 1;
	}

    }


    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # supress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    my @deleted_files = values %$full_prev_all_files;
    # NOTE(review): map in void context used purely for side effects;
    # a foreach loop would be the idiomatic form
    map { my $curr_file = $_;
	  my $full_curr_file = $curr_file;

	  if (!&util::filename_is_absolute($curr_file)) {
	      # add in import dir to make absolute

	      $full_curr_file = &util::filename_cat($collectdir,$curr_file);
	  }


	  # only files that no longer exist on disk are really "deleted"
	  if (!-e $full_curr_file) {
	      $block_hash->{'deleted_files'}->{$curr_file} = 1;
	  }
    } @deleted_files;



}
1099
1100
# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
# $mode is 'delete' or 'reindex'
#
# Parameters:
#   $archive_info  - arcinfo object holding per-OID index status
#   $block_hash    - classification sets; 'reindex_files' is updated here
#   $deleted_files - array ref of source files that have gone away (or changed)
#   $archivedir    - directory containing the archiveinf-* info databases
#   $verbosity     - >1 prints a line for each OID that gets marked
#   $mode          - 'delete' or 'reindex'; changes the message text and, in
#                    'delete' mode, removes deleted files from the reindex set
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
	$mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
	# use 'archiveinf-src' info database file to look up all the OIDs
	# that this file is used in (note in most cases, it's just one OID)

	my $src_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
	my $oids = $src_rec->{'oid'};
	my $file_record_deleted = 0;   # NOTE(review): assigned but never used

	# delete the src record
	my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
	&dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
	&dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);


	foreach my $oid (@$oids) {

	    # find the source doc (the primary file that becomes this oid)
	    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
	    my $doc_source_file = $doc_rec->{'src-file'}->[0];
	    if (!&util::filename_is_absolute($doc_source_file)) {
		# make relative src-file paths absolute against the collection dir
		$doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
	    }

	    if ($doc_source_file ne $file) {
		# its an associated or metadata file

		# mark source doc for reimport as one of its assoc files has changed or deleted
		$block_hash->{'reindex_files'}->{$doc_source_file} = 1;

	    }
	    my $curr_status = $archive_info->get_status_info($oid);
	    # only act if the OID isn't already marked "D" (deleted)
	    if (defined($curr_status) && (($curr_status ne "D"))) {
		if ($verbosity>1) {
		    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
		}
		# mark oid for deletion (it will be deleted or reimported)
		$archive_info->set_status_info($oid,"D");
		my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
		# rewrite just the <index-status> line of the raw record to 'D'
		$val =~ s/^<index-status>(.*)$/<index-status>D/m;

		my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
		my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

		&dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
		&dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
	    }
	}

    }
    # now go through and check that we haven't marked any primary files for reindex (because their associated files have changed/deleted) when they have been deleted themselves. only in delete mode.
    if ($mode eq "delete") {
	foreach my $file (@$deleted_files) {
	    if (defined $block_hash->{'reindex_files'}->{$file}) {
		delete $block_hash->{'reindex_files'}->{$file};
	    }
	}
    }


}
1180
# Recursively collect every file (not directory) beneath $dirname,
# appending each full path to the array ref $list.
#
# Parameters:
#   $dirname - directory to walk
#   $list    - array ref accumulating the full paths of files found
#
# Returns -1 (after printing a warning) if $dirname cannot be read;
# otherwise the return value is not meaningful.  '.', '..' and '.svn'
# entries are skipped.
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # Use a lexical directory handle instead of the bareword DIR: barewords
    # are package globals, and this sub recurses into subdirectories.
    my $dir_handle;
    if (!opendir($dir_handle, $dirname)) {
	print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    my @dir_entries = readdir($dir_handle);
    closedir($dir_handle);

    foreach my $subfile (@dir_entries) {
	next if ($subfile =~ m/^\.\.?$/);   # skip '.' and '..'
	next if ($subfile =~ m/^\.svn$/);   # skip subversion metadata dirs
	my $full_file = &util::filename_cat($dirname, $subfile);
	if (-d $full_file) {
	    # descend into subdirectory
	    add_dir_contents_to_list($full_file, $list);
	} else {
	    push (@$list, $full_file);
	}
    }

}
1209
1210
12111;
# Note: See TracBrowser for help on using the repository browser.