source: main/trunk/greenstone2/perllib/inexport.pm@ 26451

Last change on this file since 26451 was 26451, checked in by ak19, 8 years ago
  1. Fixed processing of the collectionconfig's indexOption element. 2. Correct set of changes for processing the new toplevel importOptions and buildOptions elements of collectionConfig.xml (which can contain such options as specify OIDtype, OIDmetadata, verbosity). 3. Undoing previous commits, since the importOptions and buildOptions elements are not nested inside plugins but are one of the toplevel elements of collectionConfig.xml. And do not need the recently-committed changes to inexport.pm either, since any command line args for import and buildOptions will override what's in collectionConfig.xml anyway.
  • Property svn:executable set to *
File size: 38.5 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use plugin;
38use plugout;
39use manifest;
40use inexport;
41use util;
42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
47
48use File::Basename;
49
# Construct an inexport object from command-line arguments.
#
# Parameters:
#   $mode                - stored under the 'mode' key of the object
#   $argv                - array ref of command-line arguments; handed to
#                          parse2::parse, after which the first remaining
#                          element is taken as the collection name
#   $options             - option description; its 'args' entry is passed to
#                          the parser and the whole structure to the usage printers
#   $opt_listall_options - option description printed when 'listall' is set
#
# Returns the blessed object.  When 'xml' is set the XML usage is printed and
# the object is returned early, without a collection name.
# Dies (after printing usage) when parsing fails, when 'listall' is set, when
# the number of leftover arguments is not exactly one, or when the first
# leftover argument looks like -h.  Also dies if the output file cannot be opened.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($self->{'listall'}) {
        if ($self->{'xml'}) {
            &PrintUsage::print_xml_usage($opt_listall_options);
        }
        else
        {
            &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
        }
        die "\n";
    }

    if ($self->{'xml'}) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
        # explicit empty argument list: a bare "&name;" call would silently
        # forward this constructor's own @_ to the callee
        &gsprintf::output_strings_in_UTF8();
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also
    if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # Three-argument open, so nothing in the file name can be read as an
        # open mode.  The failure path is spelled out as statements: the old
        # "open || (print-message && die)" form skipped the die whenever the
        # message print itself returned false.
        unless (open (OUT, '>', $out)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out);
            die;
        }
        # the handle is passed around by name from here on
        $out = 'inexport::OUT';
        $self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # the parallel helper is only loaded when more than one job is requested
    if ((defined $self->{'jobs'}) && ($self->{'jobs'}>1)) {
        require ParallelInexport;
    }

    return bless $self, $class;
}
127
128# Simplified version of the constructor for use with CGI scripts
# Lightweight constructor for CGI callers: no command-line parsing is done.
#
# Parameters:
#   $mode     - stored under the 'mode' key
#   $collect  - collection name, stored under 'collection'
#   $gsdl_cgi - optional object; when defined, its get_collection_dir($opt_site)
#               result becomes the collect directory
#   $opt_site - site name, used only when $gsdl_cgi is defined
#
# Returns the blessed object, with 'out' set to STDERR and an empty 'faillog'.
sub newCGI
{
    my ($class, $mode, $collect, $gsdl_cgi, $opt_site) = @_;

    my %object = (
        'xml'        => 0,
        'mode'       => $mode,
        'out'        => 'STDERR',   # output handle is referred to by name
        'faillog'    => "",
        'collection' => $collect,
    );

    if (!defined $gsdl_cgi) {
        # no CGI helper: fall back to the collect directory under GSDLHOME
        $object{'site'}       = "";
        $object{'collectdir'} = &util::filename_cat($ENV{'GSDLHOME'},"collect");
    }
    else {
        $object{'site'}       = $opt_site;
        $object{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }

    return bless \%object, $class;
}
# Accessor: returns the collection name stored on the object.
sub get_collection
{
    my ($self) = @_;

    my $collection_name = $self->{'collection'};
    return $collection_name;
}
159
160
# Select the collection, open the fail log and read the collection's
# configuration file.
#
# Parameters:
#   $collection - collection name, passed to colcfg::use_collection
#   $options    - option description, used only to print usage on failure
#
# Side effects: sets 'gs_version' ("3" when a non-empty site is set, else
# "2"), prepends the collection's perllib directory to @INC, opens the
# FAILLOG handle and records it under 'faillog' (by name) and 'faillogname'
# (the file path).
#
# Returns ($config_filename, $collectcfg).
# Dies if colcfg::use_collection returns "" or the fail log cannot be opened.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # set gs_version 2/3
    $self->{'gs_version'} = "2";
    if ((defined $site) && ($site ne "")) {
        # gs3
        $self->{'gs_version'} = "3";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # Three-argument open, with the failure path written as statements: the
    # old "open || (print-message && die)" form skipped the die whenever the
    # message print itself returned false.
    unless (open (FAILLOG, '>', $faillog)) {
        &gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog);
        die;
    }

    # keep the path for reporting, but pass the handle around by name
    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;

    # Read in the collection configuration file.
    my ($config_filename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
206
# Merge option values from the collection configuration into the object.
#
# For each option, a value already present on the object wins; otherwise the
# value from $collectcfg is used, and failing that a built-in default.
#
# Parameters:
#   $collectcfg - collection configuration hash; its 'infodbtype' entry is
#                 filled in / adjusted in place
#
# Dies if the resolved import directory does not exist.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $mode         = $self->{'mode'};
    my $out          = $self->{'out'};
    my $verbosity    = $self->{'verbosity'};
    my $import_path  = $self->{'importdir'};
    my $archive_path = $self->{'archivedir'} || $self->{'exportdir'} || "";

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type()
        unless defined($collectcfg->{'infodbtype'});
    # we can't use the text version for archives dbs.
    $collectcfg->{'infodbtype'} = "gdbm"
        if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz");

    # --- import directory ---
    if ($import_path eq "" && defined $collectcfg->{'importdir'}) {
        $import_path = $collectcfg->{'importdir'};
    }
    if ($import_path eq "") {
        $import_path = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import");
    }
    # turn all \ into / and remove trailing /
    $import_path =~ s/[\\\/]+/\//g;
    $import_path =~ s/\/$//;
    unless (-e $import_path) {
        &gsprintf($out, "{import.no_import_dir}\n\n", $import_path);
        die "\n";
    }
    $self->{'importdir'} = $import_path;

    # --- archives / export directory ---
    if ($archive_path eq "" && defined $collectcfg->{'archivedir'}) {
        $archive_path = $collectcfg->{'archivedir'};
    }
    if ($archive_path eq "") {
        my $default_subdir = "archives";
        if ($mode eq "export") {
            $default_subdir = "export";
        }
        elsif ($mode ne "import") {
            print STDERR "Warning: Unrecognized import/export mode '$mode'\n";
            print STDERR " Defaulting to 'archives' for file output\n";
        }
        $archive_path = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $default_subdir);
    }
    $archive_path =~ s/[\\\/]+/\//g;
    $archive_path =~ s/\/$//;
    $self->{'archivedir'} = $archive_path;

    # --- verbosity (default 2) ---
    unless ($verbosity =~ /\d+/) {
        my $cfg_verbosity = $collectcfg->{'verbosity'};
        $verbosity = (defined $cfg_verbosity && $cfg_verbosity =~ /\d+/)
            ? $cfg_verbosity
            : 2;
    }
    $self->{'verbosity'} = $verbosity;

    # --- manifest ---
    if ($self->{'manifest'} eq "" && defined $collectcfg->{'manifest'}) {
        $self->{'manifest'} = $collectcfg->{'manifest'};
    }

    # --- gzip: only ever switched on from the config, never off ---
    if (!$self->{'gzip'}
        && defined $collectcfg->{'gzip'}
        && $collectcfg->{'gzip'} =~ /^true$/i) {
        $self->{'gzip'} = 1;
    }

    # --- maxdocs (default -1) ---
    unless ($self->{'maxdocs'} =~ /\-?\d+/) {
        my $cfg_maxdocs = $collectcfg->{'maxdocs'};
        $self->{'maxdocs'} = (defined $cfg_maxdocs && $cfg_maxdocs =~ /\-?\d+/)
            ? $cfg_maxdocs
            : -1;
    }

    # --- groupsize: a value of 1 on the object may be replaced from the config ---
    if (defined $self->{'groupsize'}
        && $self->{'groupsize'} == 1
        && defined $collectcfg->{'groupsize'}
        && $collectcfg->{'groupsize'} =~ /\d+/) {
        $self->{'groupsize'} = $collectcfg->{'groupsize'};
    }

    # --- OIDtype (default "hash") ---
    my $valid_oid_type = qr/^(hash|incremental|assigned|dirname)$/;
    unless (defined $self->{'OIDtype'} && $self->{'OIDtype'} =~ $valid_oid_type) {
        my $cfg_oid_type = $collectcfg->{'OIDtype'};
        $self->{'OIDtype'} = (defined $cfg_oid_type && $cfg_oid_type =~ $valid_oid_type)
            ? $cfg_oid_type
            : "hash";
    }

    # --- OIDmetadata (default "dc.Identifier") ---
    unless (defined $self->{'OIDmetadata'} && $self->{'OIDmetadata'} ne "") {
        $self->{'OIDmetadata'} = (defined $collectcfg->{'OIDmetadata'})
            ? $collectcfg->{'OIDmetadata'}
            : "dc.Identifier";
    }

    # --- sortmeta ---
    my $sort_field = $self->{'sortmeta'};
    my $no_sort_on_object = (!defined $sort_field || $sort_field eq "");
    if ($no_sort_on_object && defined $collectcfg->{'sortmeta'}) {
        $sort_field = $collectcfg->{'sortmeta'};
    }
    if (!(defined $sort_field && $sort_field =~ /\S/)) {
        # blank values count as "no sorting"
        $sort_field = undef;
    }
    elsif ($self->{'groupsize'} > 1) {
        # sortmeta cannot be used with group size
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sort_field = undef;
    }
    $self->{'sortmeta'} = $sort_field;

    # --- removeprefix / removesuffix ---
    foreach my $affix_option ('removeprefix', 'removesuffix') {
        if (defined $collectcfg->{$affix_option} && $self->{$affix_option} eq "") {
            $self->{$affix_option} = $collectcfg->{$affix_option};
        }
    }

    # --- debug / gli: the config can switch these on ---
    foreach my $flag_option ('debug', 'gli') {
        if (defined $collectcfg->{$flag_option} && $collectcfg->{$flag_option} =~ /^true$/i) {
            $self->{$flag_option} = 1;
        }
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = "export";
    $checkdir = "archives" if ($mode eq "import");

    @{$self}{'removeold', 'keepold', 'incremental', 'incremental_mode'}
        = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
                                                   $self->{'incremental'}, $checkdir,
                                                   $collectcfg);
}
356
# Expand the file names held as keys of %$entries (one category of a parsed
# manifest) into a list of full paths.  Names that are not absolute are
# joined onto $importdir; a name that is a directory is handed to
# add_dir_contents_to_list() (defined elsewhere in this file; presumably it
# appends the directory's files to the list) instead of being added itself.
sub _expand_manifest_entries
{
    my ($importdir, $entries) = @_;

    my @full_files = ();
    foreach my $file (keys %$entries) {
        my $full_file =
            (&util::filename_is_absolute($file))
            ? $file
            : &util::filename_cat($importdir,$file);

        if (-d $full_file) {
            &add_dir_contents_to_list($full_file, \@full_files);
        } else {
            push(@full_files,$full_file);
        }
    }

    return @full_files;
}

# Run the import/export pass: load plugins and the plugout, work out which
# files are new / to be reindexed / deleted (from a manifest, or by
# comparison with the previous import when incremental), feed the import
# directory through the plugins and save the archive information.
#
# Parameters:
#   $config_filename - collection configuration file name; used here only to
#                      build the collection-level doc object for FedoraMETS
#   $collectcfg      - collection configuration hash ('infodbtype', 'plugin',
#                      'plugout' and 'collectionmeta' are read here)
#
# Returns the plugin list produced by plugin::load_plugins.
# Dies if no plugins were loaded.
sub process_files
{
    my $self = shift @_;
    my ($config_filename,$collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};

    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};

    my $incremental = $self->{'incremental'};
    my $incremental_mode = $self->{'incremental_mode'};

    my $gs_version = $self->{'gs_version'};

    my $removeold = $self->{'removeold'};

    my $saveas = $self->{'saveas'};
    my $OIDtype = $self->{'OIDtype'};
    my $OIDmetadata = $self->{'OIDmetadata'};

    my $out = $self->{'out'};
    my $faillog = $self->{'faillog'};

    my $maxdocs = $self->{'maxdocs'};
    my $gzip = $self->{'gzip'};
    my $groupsize = $self->{'groupsize'};
    my $sortmeta = $self->{'sortmeta'};

    my $removeprefix = $self->{'removeprefix'};
    my $removesuffix = $self->{'removesuffix'};

    my $gli = $self->{'gli'};

    my $jobs = $self->{'jobs'};
    my $epoch = $self->{'epoch'};

    # related to export
    my $xsltfile = $self->{'xsltfile'};
    my $group_marc = $self->{'group_marc'};
    my $mapping_file = $self->{'mapping_file'};
    my $xslt_mets = $self->{'xslt_mets'};
    my $xslt_txt = $self->{'xslt_txt'};
    my $fedora_namespace = $self->{'fedora_namespace'};
    my $metadata_prefix = $self->{'metadata_prefix'};

    if ($inexport_mode eq "import") {
        print STDERR "<Import>\n" if $gli;
    }
    else {
        print STDERR "<export>\n" if $gli;
    }

    # Class->new rather than "new Class": this package defines its own new(),
    # which makes the indirect-object form ambiguous to the parser.
    my $manifest_lookup = manifest->new($collectcfg->{'infodbtype'},$archivedir);
    if ($self->{'manifest'} ne "") {
        my $manifest_filename = $self->{'manifest'};

        if (!&util::filename_is_absolute($manifest_filename)) {
            $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
        }

        # NOTE(review): this normalises the stored option value, not the
        # $manifest_filename that is actually parsed below - confirm intent.
        $self->{'manifest'} =~ s/[\\\/]+/\//g;
        $self->{'manifest'} =~ s/\/$//;

        $manifest_lookup->parse($manifest_filename);
    }

    my $manifest = $self->{'manifest'};

    # load all the plugins
    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    my $plugin_incr_mode = $incremental_mode;
    if ($manifest ne "") {
        # if we have a manifest file, then we pretend we are fully incremental for plugins
        $plugin_incr_mode = "all";
    }
    # some global options for the plugins
    my @global_opts = ();

    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode, $gs_version);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory (and tmp
    # directory) if needed
    if ($removeold) {
        if (-e $archivedir) {
            &gsprintf($out, "{import.removing_archives}\n");
            &util::rm_r ($archivedir);
        }
        my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
        $tmpdir =~ s/[\\\/]+/\//g;
        $tmpdir =~ s/\/$//;
        if (-e $tmpdir) {
            &gsprintf($out, "{import.removing_tmpdir}\n");
            &util::rm_r ($tmpdir);
        }
    }

    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file

    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);

    my $archive_info = arcinfo->new($collectcfg->{'infodbtype'});
    $archive_info->load_info ($arcinfo_doc_filename);

    if ($manifest eq "") {
        # Load in list of files in import folder from last import (if present)
        $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    }

    ####Use Plugout####
    my $plugout;

    if ($inexport_mode eq "import") {
        if (defined $collectcfg->{'plugout'}) {
            # If a plugout was specified in the collect.cfg file, assume it is sensible
            # We can't check the name because it could be anything, if it is a custom plugout
            $plugout = $collectcfg->{'plugout'};
        }
        else {
            if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
                push @$plugout,"GreenstoneXMLPlugout";
            }
            else {
                push @$plugout,$saveas."Plugout";
            }
        }
    }
    else {
        # NOTE(review): $plugout is used as an array ref below ($plugout->[0]),
        # so if the config value is an array ref this pattern is matched
        # against its stringified form and can never succeed - confirm
        # whether the first element was meant to be tested.
        if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
            $plugout = $collectcfg->{'plugout'};
        }
        else {
            if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
                push @$plugout,"GreenstoneMETSPlugout";
            }
            else {
                push @$plugout,$saveas."Plugout";
            }
        }
    }

    my $plugout_name = $plugout->[0];

    # append the plugout's options after its name
    push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
    push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
    push @$plugout,("-debug") if ($debug);
    push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
    push @$plugout,("-gzip_output") if ($gzip);
    push @$plugout,("-output_handle",$out) if (defined $out);

    push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");

    if ($plugout_name =~ m/^MARCXMLPlugout$/) {
        push @$plugout,("-group") if ($group_marc);
        push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "");
    }
    if ($plugout_name =~ m/^.*METSPlugout$/) {
        push @$plugout,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $xslt_mets ne "");
        push @$plugout,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $xslt_txt ne "");
    }

    if ($plugout_name eq "FedoraMETSPlugout") {
        push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "");
    }

    if ($plugout_name eq "DSpacePlugout") {
        push @$plugout,("-metadata_prefix",$metadata_prefix) if (defined $metadata_prefix && $metadata_prefix ne "");
    }

    my $processor = &plugout::load_plugout($plugout);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($removeold) {
        # occasionally, plugins may want to do something on remove
        # old, eg pharos image indexing
        &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
    }

    # process the import directory
    my $block_hash = {};
    $block_hash->{'new_files'} = {};
    $block_hash->{'reindex_files'} = {};
    my $metadata = {};

    # global blocking pass may set up some metadata
    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);

    if ($manifest ne "") {
        #
        # 1. Process delete files first
        #
        my @full_deleted_files = &_expand_manifest_entries($importdir, $manifest_lookup->{'delete'});

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
        mark_docs_for_deletion($archive_info,{},
                               \@full_deleted_files,
                               $archivedir, $verbosity, "delete");

        #
        # 2. Now files for reindexing
        #
        my @full_reindex_files = &_expand_manifest_entries($importdir, $manifest_lookup->{'reindex'});

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
        mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");

        # And now to ensure the new version of the file processed by
        # appropriate plugin, we need to add it to block_hash reindex list
        foreach my $full_rf (@full_reindex_files) {
            $block_hash->{'reindex_files'}->{$full_rf} = 1;
        }

        #
        # 3. Now finally any new files - add to block_hash new_files list
        #
        my @full_new_files = &_expand_manifest_entries($importdir, $manifest_lookup->{'index'});

        # $arcinfo_src_filename was already computed above for this
        # infodbtype and archives directory, so it is reused here
        my $arcinfodb_map = {};
        &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
        foreach my $f (@full_new_files) {
            # check that we haven't seen it already
            if (defined $arcinfodb_map->{$f}) {
                # TODO make better warning
                print STDERR "Warning: $f already in src archive, \n";
            } else {
                $block_hash->{'new_files'}->{$f} = 1;
            }
        }

        undef $arcinfodb_map;
    }
    else {
        # if incremental, we read through the import folder to see what's changed.

        if ($incremental || $incremental_mode eq "onlyadd") {
            prime_doc_oid_count($archivedir);

            # Can now work out which files were new, already existed, and have
            # been deleted

            new_vs_old_import_diff($archive_info,$block_hash,$importdir,
                                   $archivedir,$verbosity,$incremental_mode);

            my @new_files = sort keys %{$block_hash->{'new_files'}};
            # (the closing parenthesis used to sit after the comparison)
            if (scalar(@new_files) > 0) {
                print STDERR "New files and modified metadata files since last import:\n ";
                print STDERR join("\n ",@new_files), "\n";
            }

            if ($incremental) {
                # only look for deletions if we are truly incremental
                my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
                # Filter out any in gsdl/tmp area
                my @filtered_deleted_files = ();
                my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
                my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
                $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
                $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);

                foreach my $df (@deleted_files) {
                    next if ($df =~ m/^$gsdl_tmp_area/);
                    next if ($df =~ m/^$collect_tmp_area/);

                    push(@filtered_deleted_files,$df);
                }

                @deleted_files = @filtered_deleted_files;

                if (scalar(@deleted_files)>0) {
                    print STDERR "Files deleted since last import:\n ";
                    print STDERR join("\n ",@deleted_files), "\n";

                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);

                    mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
                }

                my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};

                if (scalar(@reindex_files)>0) {
                    print STDERR "Files to reindex since last import:\n ";
                    print STDERR join("\n ",@reindex_files), "\n";
                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
                    mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
                }
            }
        }
    }

    # Check for existence of the file that's to contain earliestDateStamp in archivesdir
    # Do nothing if the file already exists (file exists on incremental build).
    # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
    # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
    # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
    # oailastmodified and oailastmodifieddate
    my $earliestDatestampFile = &util::filename_cat($archivedir, "earliestDatestamp");
    if (!-f $earliestDatestampFile && -d $archivedir) {
        my $current_time_in_seconds = time; # in seconds

        # three-argument open on a lexical handle; failure is reported but
        # deliberately not fatal
        if (open(my $datestamp_fh, '>', $earliestDatestampFile)) {
            print $datestamp_fh $current_time_in_seconds;
            close($datestamp_fh);
        }
        else {
            &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
        }
    }

    # now, whichever mode we are in, we can process the entire import folder
    if ((defined $jobs) && ($jobs > 1))
    {
        # if jobs are set to >1, run in parallel using MPI helper
        # [hs, 1 july 2010]
        &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,
                                              $self->{'collection'}, $self->{'site'});
    }
    else
    {
        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }

    if ($saveas eq "FedoraMETS") {
        # create collection "doc obj" for Fedora that contains
        # collection-level metadata

        my $doc_obj = doc->new($config_filename,"nonindexed_doc","none");
        $doc_obj->set_OID("collection");

        my $col_meta = $collectcfg->{'collectionmeta'};

        if (defined $col_meta) {
            store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
            store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
        }
        $processor->process($doc_obj);
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # Store the value of OIDCount (used in doc.pm) so it can be
    # restored correctly to this value on an incremental build
    store_doc_oid_count($archivedir);

    # write out the archive information file
    $processor->close_file_output() if (defined $groupsize) && ($groupsize > 1);
    $processor->close_group_output() if $processor->is_group();

    # for backwards compatibility with archives.inf file
    # NOTE(review): as written the '$' anchor binds only to '\.inf', so any
    # path merely containing "contents" also matches - confirm intent.
    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
        $archive_info->save_info($arcinfo_doc_filename);
    }
    else {
        $archive_info->save_revinfo_db($arcinfo_src_filename);
    }

    return $pluginfo;
}
797
798
# Print the completion banner, write the plugin statistics and close the
# output and fail-log handles.
#
# Parameters:
#   $pluginfo - plugin list, passed through to plugin::write_stats
#
# Statistics go to the 'statsfile' option: STDERR/STDOUT by name, otherwise
# a file that is opened here (falling back to STDERR if it cannot be opened).
# They are skipped when the run used more than one job.
sub generate_statistics
{
    my $self = shift @_;
    my ($pluginfo) = @_;

    my $inexport_mode = $self->{'mode'};

    my $statsfile = $self->{'statsfile'};
    my $out = $self->{'out'};
    my $faillogname = $self->{'faillogname'};
    my $gli = $self->{'gli'};
    my $jobs = $self->{'jobs'};

    # Same test the rest of this file uses to choose the parallel path.  The
    # previous "$jobs == 1" check meant a job count below 1 was processed
    # serially yet produced no statistics.
    my $ran_in_parallel = ((defined $jobs) && ($jobs > 1));

    # write out import stats
    if (!$ran_in_parallel)
    {
        # only output statistics here when the run was NOT farmed out to
        # multiple jobs
        # [hs, 1 july 2010]

        my $close_stats = 0;
        if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
            # three-argument open so the file name cannot be read as a mode
            if (open (STATS, '>', $statsfile)) {
                # the handle is passed on by name
                $statsfile = 'inexport::STATS';
                $close_stats = 1;
            } else {
                &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
                &gsprintf($out, "{import.stats_backup}\n");
                $statsfile = 'STDERR';
            }
        }

        &gsprintf($out, "\n");
        &gsprintf($out, "*********************************************\n");
        &gsprintf($out, "{$inexport_mode.complete}\n");
        &gsprintf($out, "*********************************************\n");

        &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
        if ($close_stats) {
            close STATS;
        }
    }

    close OUT if $self->{'close_out'};
    close FAILLOG;
}
845
846
# Copy one collection-level metadata field onto the top section of a doc object.
#
# Parameters:
#   $collectionmeta - hash of field name => { key => value }
#   $field          - the field to copy (e.g. "collectionname")
#   $doc_obj        - object providing get_top_section() and add_utf8_metadata()
#
# Each value is stored as "ex.<field>"; a key of the form "[l=xx]" adds a
# "^xx" suffix to that label.  Values labelled "ex.collectionname" or
# "ex.collectionname^en" are additionally stored as dc.Title.
sub store_collectionmeta
{
    my ($collectionmeta,$field,$doc_obj) = @_;

    my $top_section  = $doc_obj->get_top_section();
    my $values_by_key = $collectionmeta->{$field};

    # see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
    # while "collectionname" in GS2 is called "name" in GS3.
    # Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
    my %also_title = (
        "ex.collectionname^en" => 1,
        "ex.collectionname"    => 1,
    );

    foreach my $key (keys %$values_by_key)
    {
        my $value = $values_by_key->{$key};

        # language-qualified keys look like "[l=en]"
        my $label = ($key =~ m/^\[l=(.*?)\]$/)
            ? "ex.$field" . "^$1"
            : "ex.$field";

        $doc_obj->add_utf8_metadata($top_section,$label, $value);

        next unless $also_title{$label};
        $doc_obj->add_utf8_metadata($top_section,"dc.Title", $value);
    }
}
884
885
# Returns the path of the "OIDcount" file inside the given archives directory.
sub oid_count_file
{
    my $archivedir = shift @_;

    return &util::filename_cat ($archivedir, "OIDcount");
}
890
891
# Restore $doc::OIDcount from the "OIDcount" file in the archives directory.
#
# Parameters:
#   $archivedir - archives directory that may contain the OIDcount file
#
# Does nothing when the file does not exist.  An unreadable file is reported
# on STDERR; an empty file leaves the counter as it was.
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    return unless (-e $oid_count_filename);

    # three-argument open on a lexical handle
    my $oid_fh;
    unless (open($oid_fh, '<', $oid_count_filename)) {
        &gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
        return;
    }

    # only the first line of the file is used
    my $OIDcount = <$oid_fh>;
    close($oid_fh);

    # an empty file yields undef; keep the current counter rather than
    # overwrite it with an undefined value
    if (defined $OIDcount) {
        chomp $OIDcount;
        $doc::OIDcount = $OIDcount;
    }

    return;
}
911
# Use the file "OIDcount" in the archives directory to record
# what value doc.pm got up to ($doc::OIDcount).
#
# Parameters:
#   $archivedir - archives directory in which the OIDcount file is written
#
# A failure to open, write or close the file is reported on STDERR and is
# not fatal.
sub store_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    my $written = 0;
    # three-argument open on a lexical handle
    if (open(my $oid_fh, '>', $oid_count_filename)) {
        $written = print {$oid_fh} $doc::OIDcount, "\n";
        # buffered write errors only surface when the handle is closed
        $written = close($oid_fh) && $written;
    }

    if (!$written) {
        &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
930
931
932
# Work out, for an incremental import, which files are new, which were
# already imported, which need reindexing and which have been deleted.
#
# Arguments:
#   $archive_info     - arcinfo object; its 'infodbtype' and
#                       'prev_import_filelist' fields are read
#   $block_hash       - hash ref that is updated in place: every entry is
#                       removed from 'all_files' and recorded under
#                       'existing_files', 'new_files' or
#                       'new_or_modified_metadata_files'; 'reindex_files'
#                       and 'deleted_files' are filled in afterwards
#                       ('metadata_files' is only consulted)
#   $importdir        - directory prepended to relative 'all_files' entries
#   $archivedir       - directory holding the archiveinf-doc database
#   $verbosity        - metadata-file detections are reported when >= 2
#   $incremental_mode - "all" records previously imported files as
#                       existing; any other value warns and treats them
#                       as new files
#
# All results are delivered through $block_hash; nothing useful is returned.
sub new_vs_old_import_diff
{
    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # The age of archiveinf-doc is the yardstick for "modified since the
    # last import".  -M gives an age in days, so a SMALLER value means a
    # MORE recently modified file.
    # NOTE(review): -M yields undef if the database does not exist yet;
    # the comparisons below then behave as if the age were 0 - confirm
    # that this is acceptable for a first import.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $archiveinf_timestamp = -M $arcinfo_doc_filename;

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default.  Maps absolute name => name as originally recorded.
    my $prev_all_files = $archive_info->{'prev_import_filelist'};
    my $full_prev_all_files = {};

    foreach my $prev_file (keys %$prev_all_files) {
        my $full_prev_file = $prev_file;
        if (!&util::filename_is_absolute($prev_file)) {
            $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
        }
        $full_prev_all_files->{$full_prev_file} = $prev_file;
    }

    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

        # entry in 'all_files' is moved to either 'existing_files',
        # 'new_files', or 'new_or_modified_metadata_files'
        my $full_curr_file = $curr_file;
        if (!&util::filename_is_absolute($curr_file)) {
            # add in import dir to make absolute
            $full_curr_file = &util::filename_cat($importdir,$curr_file);
        }

        my $is_metadata_file = $block_hash->{'metadata_files'}->{$full_curr_file};

        if (defined $full_prev_all_files->{$full_curr_file}) {
            # had it before: delete it so that only files that need
            # deleting are left in full_prev_all_files
            delete $full_prev_all_files->{$full_curr_file};

            if ($is_metadata_file) {
                # only of interest if it is newer than the last import
                if ((-M $full_curr_file) < $archiveinf_timestamp) {
                    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
                    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
                }
            }
            elsif ($incremental_mode eq "all") {
                $block_hash->{'existing_files'}->{$full_curr_file} = 1;
            }
            else {
                # Warning in "onlyadd" mode, but had it before!
                print STDERR "Warning: File $full_curr_file previously imported.\n";
                print STDERR "         Treating as new file\n";

                $block_hash->{'new_files'}->{$full_curr_file} = 1;
            }
        }
        elsif ($is_metadata_file) {
            # the new file is the special sort of file greenstone uses
            # to attach metadata to src documents
            # i.e metadata.xml
            # (but note, the filename used is not constrained in
            # Greenstone to always be this)
            print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
            $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
        }
        else {
            $block_hash->{'new_files'}->{$full_curr_file} = 1;
        }

        delete $block_hash->{'all_files'}->{$curr_file};
    }

    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).
    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
        my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

        $situated_dir =~ s/[\\\/]+$//; # remove trailing slashes
        $situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression

        # The directory name must be followed by a path separator (or be
        # the whole string); a bare prefix match would also catch sibling
        # directories such as "docs2" when the metadata file is in "docs".
        my $within_dir_re = qr/^${situated_dir}(?:[\\\/]|\z)/;

        # Go through existing_files, and mark anything that is contained
        # within 'situated_dir' to be reindexed (in case some of the metadata
        # attaches to one of these files)
        foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {
            next unless ($existing_f =~ $within_dir_re);

            print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

            $block_hash->{'reindex_files'}->{$existing_f} = 1;
            delete $block_hash->{'existing_files'}->{$existing_f};
        }

        # metadata file needs to be in new_files list so parsed by MetadataXMLPlug
        # (or equivalent)
        $block_hash->{'new_files'}->{$new_mdf} = 1;
    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    # (such files are added to 'reindex_files' but stay in 'existing_files')
    foreach my $existing_filename (sort keys %{$block_hash->{'existing_files'}}) {
        if ((-M $existing_filename) < $archiveinf_timestamp) {
            # file is newer than last build
            print STDERR "**** Reindexing existing file: $existing_filename\n";

            $block_hash->{'reindex_files'}->{$existing_filename} = 1;
        }
    }

    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # suppress it from going into the final list
    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    foreach my $prev_file (values %$full_prev_all_files) {
        my $full_prev_file = $prev_file;
        if (!&util::filename_is_absolute($prev_file)) {
            # add in collect dir to make absolute
            $full_prev_file = &util::filename_cat($collectdir,$prev_file);
        }

        # keyed by the name as originally recorded, not the absolute one
        if (!-e $full_prev_file) {
            $block_hash->{'deleted_files'}->{$prev_file} = 1;
        }
    }

    return;
}
1130
1131
# Used both to delete "deleted" docs and to retire the old versions of
# "changed" docs; $mode is 'delete' or 'reindex'.
#
# For every file in @$deleted_files the archiveinf-src record is looked up
# and then removed, and each OID listed in that record has its index
# status set to "D" (in $archive_info and in archiveinf-doc) unless it is
# already "D".  When the file is not the primary source file of an OID,
# that primary source file is added to $block_hash->{'reindex_files'}.
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    # wording used in the progress message further down
    my $mode_text = ($mode eq "reindex") ? "reindexed" : "deleted from index";

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $doc_db_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $src_db_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);

    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
        # use 'archiveinf-src' info database file to look up all the OIDs
        # that this file is used in (note in most cases, it's just one OID)
        my $src_rec = &dbutil::read_infodb_entry($infodbtype, $src_db_filename, $file);
        my $oids = $src_rec->{'oid'};

        # delete the src record
        my $src_handle = &dbutil::open_infodb_write_handle($infodbtype, $src_db_filename, "append");
        &dbutil::delete_infodb_entry($infodbtype, $src_handle, $file);
        &dbutil::close_infodb_write_handle($infodbtype, $src_handle);

        foreach my $oid (@$oids) {

            # find the source doc (the primary file that becomes this oid)
            my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $doc_db_filename, $oid);
            my $primary_file = $doc_rec->{'src-file'}->[0];
            $primary_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$primary_file)
                unless &util::filename_is_absolute($primary_file);

            # $file is an associated or metadata file: mark the source doc
            # for reimport as one of its assoc files has changed or deleted
            $block_hash->{'reindex_files'}->{$primary_file} = 1
                if ($primary_file ne $file);

            # nothing more to do when the status is unknown or already "D"
            my $status = $archive_info->get_status_info($oid);
            next if (!defined($status) || ($status eq "D"));

            print STDERR "$oid ($primary_file) marked to be $mode_text on next buildcol.pl\n"
                if ($verbosity > 1);

            # mark oid for deletion (it will be deleted or reimported)
            $archive_info->set_status_info($oid,"D");

            # rewrite the stored doc record with its index-status line set to D
            my $raw_entry = &dbutil::read_infodb_rawentry($infodbtype, $doc_db_filename, $oid);
            $raw_entry =~ s/^<index-status>(.*)$/<index-status>D/m;
            my $updated_rec = &dbutil::convert_infodb_string_to_hash($raw_entry);

            my $doc_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db_filename, "append");
            &dbutil::write_infodb_entry($infodbtype, $doc_handle, $oid, $updated_rec);
            &dbutil::close_infodb_write_handle($infodbtype, $doc_handle);
        }
    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.
    if ($mode eq "delete") {
        foreach my $file (@$deleted_files) {
            delete $block_hash->{'reindex_files'}->{$file}
                if (defined $block_hash->{'reindex_files'}->{$file});
        }
    }
}
1216
# Recursively append the full path of every non-directory entry found
# under $dirname to the array referenced by $list.
#
# Arguments:
#   $dirname - directory to scan
#   $list    - array ref that the file paths are pushed onto
#
# The entries ".", ".." and ".svn" are skipped.  Entries are visited in
# whatever order readdir supplies them.
#
# Returns -1 (after printing a warning to STDERR) when $dirname cannot be
# opened, and 0 otherwise.  A failure inside a subdirectory is warned
# about but does not stop the scan of the remaining entries.
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # A lexical handle is used so that this routine cannot disturb a
    # dirhandle named DIR that a caller may have open.
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
        print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
        return -1; # error in processing
    }

    # read everything up front and close the handle before recursing
    my @entries = readdir ($dir_handle);
    closedir ($dir_handle);

    foreach my $subfile (@entries) {
        next if ($subfile =~ m/^\.\.?$/);
        next if ($subfile =~ /^\.svn$/);

        my $full_file = &util::filename_cat($dirname, $subfile);
        if (-d $full_file) {
            # best effort: the result of the recursive call is not checked
            &add_dir_contents_to_list($full_file, $list);
        } else {
            push (@$list, $full_file);
        }
    }

    return 0;
}
1245
1246
12471;
Note: See TracBrowser for help on using the repository browser.