source: main/trunk/greenstone2/perllib/inexport.pm@ 24950

Last change on this file since 24950 was 24829, checked in by ak19, 12 years ago

Changes to bat files and perl code to deal with brackets in (Windows) filepath. Also checked winmake.bat files to see if changes were needed there. These changes go together with the commits 24826 to 24828 for gems.bat, and commit 24820 on makegs2.bat.

  • Property svn:executable set to *
File size: 38.3 KB
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use plugin;
38use plugout;
39use manifest;
40use inexport;
41use util;
42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
47
48use File::Basename;
49
# Constructor used by the import.pl / export.pl command-line scripts.
#
# Parameters:
#   $mode                - "import" or "export"; stored and used later to
#                          select defaults (e.g. archives vs export dir)
#   $argv                - ref to the remaining command-line argument list;
#                          after option parsing it must hold exactly one
#                          item, the collection name
#   $options             - option-spec hash whose {'args'} entry is handed
#                          to parse2::parse; also used for usage output
#   $opt_listall_options - option spec printed when -listall was given
#
# Returns a blessed inexport object.  Note: several paths die() after
# printing usage, and the -xml path returns early WITHOUT checking for a
# collection name (a collection is not needed just to emit XML usage).
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins; parse2::parse fills $self
    # in-place with the recognised option values
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    # -listall: print the full option listing (XML or text) and stop
    if ($self->{'listall'}) {
        if ($self->{'xml'}) {
            &PrintUsage::print_xml_usage($opt_listall_options);
        }
        else
        {
            &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
        }
        die "\n";
    }

    # -xml: emit machine-readable usage and return early (no collection
    # name required in this mode)
    if ($self->{'xml'}) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name.  We don't want to do this earlier, because the
    # -xml arg doesn't need a collection name.
    # Or if the user specified -h, then we output the usage as well.
    if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # Redirect output to a file unless -out was STDERR/STDOUT.
    # NOTE(review): 2-arg open onto the package-scoped bareword handle OUT;
    # generate_statistics() later does "close OUT", so the handle name is
    # part of this module's internal contract and must not be made lexical
    # in isolation.
    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        open (OUT, ">$out") ||
            (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
        $out = 'inexport::OUT';
        $self->{'close_out'} = 1;
    }
    # $out here is a handle *name* string; FileHandle/IO::Handle method
    # call on it works because of "no strict 'refs'" at file scope
    $out->autoflush(1);
    $self->{'out'} = $out;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # parallel processing support is only loaded when requested (-jobs > 1)
    if ((defined $self->{'jobs'}) && ($self->{'jobs'}>1)) {
        require ParallelInexport;
    }

    return bless $self, $class;
}
127
128# Simplified version of the constructor for use with CGI scripts
# Simplified constructor for use from CGI scripts: no command-line
# parsing, output fixed to STDERR, and the collection name supplied
# directly.
#
# Parameters:
#   $mode     - "import" or "export"
#   $collect  - collection name
#   $gsdl_cgi - optional CGI helper object; when given, it supplies the
#               collect directory for $opt_site
#   $opt_site - site name (only meaningful alongside $gsdl_cgi)
sub newCGI
{
    my $class = shift (@_);
    my ($mode, $collect, $gsdl_cgi, $opt_site) = @_;

    # Minimal object state: never in XML-usage mode, diagnostics to STDERR.
    my $self = { 'xml' => 0, 'mode' => $mode };
    $self->{'out'} = STDERR;

    if (defined $gsdl_cgi) {
        # Running under the CGI wrapper: ask it where the site's
        # collect directory lives.
        $self->{'site'}       = $opt_site;
        $self->{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }
    else {
        # Stand-alone fallback: no site, default $GSDLHOME/collect.
        $self->{'site'}       = "";
        $self->{'collectdir'} = &util::filename_cat($ENV{'GSDLHOME'}, "collect");
    }

    $self->{'faillog'}    = "";
    $self->{'collection'} = $collect;

    return bless $self, $class;
}
# Accessor: return the collection name this object was constructed with.
sub get_collection
{
    my ($self) = @_;

    return $self->{'collection'};
}
159
160
# Resolve the collection, open the fail log, and read the collection's
# configuration file.
#
# Parameters:
#   $collection - collection name as given by the user
#   $options    - option spec, used only for usage output on failure
#
# Returns ($config_filename, $collectcfg) where $collectcfg is the parsed
# configuration hash.  Dies (after printing usage) if the collection
# cannot be resolved.
#
# Side effects: sets $ENV{'GSDLCOLLECTDIR'} (via colcfg::use_collection —
# presumably; TODO confirm), prepends the collection's perllib to @INC,
# and opens the package-scoped FAILLOG handle that generate_statistics()
# later closes.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    # use_collection returns "" on failure (unknown collection etc.)
    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # check that we can open the faillog; default to etc/fail.log in the
    # collection directory when none was supplied
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # NOTE(review): 2-arg open onto bareword FAILLOG; generate_statistics()
    # does "close FAILLOG", so the package-scoped handle name is load-bearing
    open (FAILLOG, ">$faillog") ||
        (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);

    # keep the path for reporting, but store the handle *name* in
    # $self->{'faillog'} so later code can treat it as a filehandle
    # (works via "no strict 'refs'")
    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;

    # Read in the collection configuration file.
    # $gs_mode distinguishes GS2 collect.cfg from GS3 collectionConfig.xml
    my ($config_filename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
200
# Merge command-line option values (already in $self) with values from the
# collection configuration ($collectcfg), applying defaults last.
#
# Precedence throughout: an explicit command-line value wins over the
# collect.cfg value, which wins over the hard-coded default.  "Unset" is
# detected per-option (empty string, non-numeric, undef, etc.), so the
# order of the checks below matters.
#
# Parameter: $collectcfg - parsed collection configuration hash (mutated:
#                          infodbtype may be filled in / rewritten).
# Returns nothing useful; all results are stored back into $self.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};
    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'} || "";
    my $out = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
        $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
        # we can't use the text version for archives dbs.
        $collectcfg->{'infodbtype'} = "gdbm";
    }
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    # the import directory must already exist; the archives directory is
    # created later by process_files
    if (!-e $importdir) {
        &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
        die "\n";
    }
    $self->{'importdir'} = $importdir;

    # default output directory depends on mode: archives/ for import,
    # export/ for export
    if ($archivedir eq "") {
        if ($inexport_mode eq "import") {
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
        }
        elsif ($inexport_mode eq "export") {
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
        }
        else {
            print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
            print STDERR "         Defaulting to 'archives' for file output\n";
            $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
        }
    }

    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $self->{'archivedir'} = $archivedir;

    # NOTE(review): these /\d+/ tests are unanchored, so any value merely
    # *containing* a digit counts as "numeric"
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    $self->{'verbosity'} = $verbosity;

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
        $self->{'manifest'} = $collectcfg->{'manifest'};
    }

    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $self->{'gzip'} = 1;
        }
    }

    if ($self->{'maxdocs'} !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
        } else {
            $self->{'maxdocs'} = -1; # the default
        }
    }

    # groupsize == 1 is treated as "not set on the command line", so the
    # collect.cfg value may override it
    if ((defined $self->{'groupsize'}) && ($self->{'groupsize'} == 1)) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $self->{'groupsize'} = $collectcfg->{'groupsize'};
        }
    }

    # OIDtype must be one of the recognised schemes; anything else falls
    # back to collect.cfg, then to "hash"
    if (!defined $self->{'OIDtype'}
        || ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) {
        if (defined $collectcfg->{'OIDtype'}
            && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
            $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
        } else {
            $self->{'OIDtype'} = "hash"; # the default
        }
    }

    if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
        if (defined $collectcfg->{'OIDmetadata'}) {
            $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
        } else {
            $self->{'OIDmetadata'} = "dc.Identifier"; # the default
        }
    }

    my $sortmeta = $self->{'sortmeta'};
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $self->{'groupsize'} > 1) {
        &gsprintf($out, "{import.cannot_sort}\n\n");
        $sortmeta = undef;
    }
    $self->{'sortmeta'} = $sortmeta;

    if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") {
        $self->{'removeprefix'} = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") {
        $self->{'removesuffix'} = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold; scriptutil reconciles the three
    # mutually-interacting flags against the output dir's current state
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
                                                   $self->{'incremental'}, $checkdir,
                                                   $collectcfg);

    $self->{'removeold'} = $removeold;
    $self->{'keepold'} = $keepold;
    $self->{'incremental'} = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;
}
349
# Main work-horse of import/export: load plugins and a plugout, work out
# which source files are new / changed / deleted (via a manifest or an
# incremental diff against the previous import), then run every plugin
# over the import directory and write the resulting archive info.
#
# Parameters:
#   $config_filename - path of the collection config (passed to doc objects)
#   $collectcfg      - parsed collection configuration hash
#
# Returns $pluginfo (the loaded plugin list) for use by
# generate_statistics().  Dies if no plugins load or on fatal I/O errors.
sub process_files
{
    my $self = shift @_;
    my ($config_filename,$collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    # unpack the many options resolved earlier by set_collection_options
    my $verbosity = $self->{'verbosity'};
    my $debug = $self->{'debug'};

    my $importdir = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};

    my $incremental = $self->{'incremental'};
    my $incremental_mode = $self->{'incremental_mode'};

    my $removeold = $self->{'removeold'};
    my $keepold = $self->{'keepold'};

    my $saveas = $self->{'saveas'};
    my $OIDtype = $self->{'OIDtype'};
    my $OIDmetadata = $self->{'OIDmetadata'};

    my $out = $self->{'out'};
    my $faillog = $self->{'faillog'};

    my $maxdocs = $self->{'maxdocs'};
    my $gzip = $self->{'gzip'};
    my $groupsize = $self->{'groupsize'};
    my $sortmeta = $self->{'sortmeta'};

    my $removeprefix = $self->{'removeprefix'};
    my $removesuffix = $self->{'removesuffix'};

    my $gli = $self->{'gli'};

    my $jobs = $self->{'jobs'};
    my $epoch = $self->{'epoch'};

    # related to export
    my $xsltfile = $self->{'xsltfile'};
    my $group_marc = $self->{'group_marc'};
    my $mapping_file = $self->{'mapping_file'};
    my $xslt_mets = $self->{'xslt_mets'};
    my $xslt_txt = $self->{'xslt_txt'};
    my $fedora_namespace = $self->{'fedora_namespace'};
    my $metadata_prefix = $self->{'metadata_prefix'};

    # progress markers consumed by the GLI front end
    if ($inexport_mode eq "import") {
        print STDERR "<Import>\n" if $gli;
    }
    else {
        print STDERR "<export>\n" if $gli;
    }

    # an explicit manifest (list of files to index/reindex/delete)
    # bypasses the incremental directory diff below
    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
    if ($self->{'manifest'} ne "") {
        my $manifest_filename = $self->{'manifest'};

        if (!&util::filename_is_absolute($manifest_filename)) {
            $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
        }

        # normalise slashes, drop trailing separator
        $self->{'manifest'} =~ s/[\\\/]+/\//g;
        $self->{'manifest'} =~ s/\/$//;

        $manifest_lookup->parse($manifest_filename);
    }

    my $manifest = $self->{'manifest'};

    # load all the plugins
    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    my $plugin_incr_mode = $incremental_mode;
    if ($manifest ne "") {
        # if we have a manifest file, then we pretend we are fully incremental for plugins
        $plugin_incr_mode = "all";
    }
    # some global options for the plugins
    my @global_opts = ();

    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory (and tmp
    # directory) if needed
    if ($removeold) {
        if (-e $archivedir) {
            &gsprintf($out, "{import.removing_archives}\n");
            &util::rm_r ($archivedir);
        }
        my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
        $tmpdir =~ s/[\\\/]+/\//g;
        $tmpdir =~ s/\/$//;
        if (-e $tmpdir) {
            &gsprintf($out, "{import.removing_tmpdir}\n");
            &util::rm_r ($tmpdir);
        }
    }

    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file

    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);

    my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
    $archive_info->load_info ($arcinfo_doc_filename);

    if ($manifest eq "") {
        # Load in list of files in import folder from last import (if present)
        $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    }

    #### Use Plugout ####
    # Decide which plugout (output format writer) to use.  Import mode
    # defaults to GreenstoneXML; export mode defaults to GreenstoneMETS.
    my $plugout;

    if ($inexport_mode eq "import") {
        if (defined $collectcfg->{'plugout'}) {
            # If a plugout was specified in the collect.cfg file, assume it is sensible
            # We can't check the name because it could be anything, if it is a custom plugout
            $plugout = $collectcfg->{'plugout'};
        }
        else{
            if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
                push @$plugout,"GreenstoneXMLPlugout";
            }
            else{
                push @$plugout,$saveas."Plugout";
            }
        }
    }
    else {
        if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
            $plugout = $collectcfg->{'plugout'};
        }
        else{
            if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
                push @$plugout,"GreenstoneMETSPlugout";
            }
            else{
                push @$plugout,$saveas."Plugout";
            }
        }
    }

    my $plugout_name = $plugout->[0];

    # append plugout options in command-line form; plugout-specific
    # options are only added for the matching plugout
    push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
    push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
    push @$plugout,("-debug") if ($debug);
    push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
    push @$plugout,("-gzip_output") if ($gzip);
    push @$plugout,("-output_handle",$out) if (defined $out);

    push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");

    if ($plugout_name =~ m/^MARCXMLPlugout$/) {
        push @$plugout,("-group") if ($group_marc);
        push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "");
    }
    if ($plugout_name =~ m/^.*METSPlugout$/) {
        push @$plugout,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $xslt_mets ne "");
        push @$plugout,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $xslt_txt ne "");
    }

    if ($plugout_name eq "FedoraMETSPlugout") {
        push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "");
    }

    if ($plugout_name eq "DSpacePlugout") {
        push @$plugout,("-metadata_prefix",$metadata_prefix) if (defined $metadata_prefix && $metadata_prefix ne "");
    }

    my $processor = &plugout::load_plugout($plugout);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($removeold) {
        # occasionally, plugins may want to do something on remove
        # old, eg pharos image indexing
        &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
    }

    # process the import directory
    my $block_hash = {};
    $block_hash->{'new_files'} = {};
    $block_hash->{'reindex_files'} = {};
    my $metadata = {};

    # global blocking pass may set up some metadata
    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);

    if ($manifest ne "") {
        # Manifest-driven build: the manifest tells us exactly which
        # files to delete, reindex and add.

        #
        # 1. Process delete files first
        #
        my @deleted_files = keys %{$manifest_lookup->{'delete'}};
        my @full_deleted_files = ();

        # ensure all filenames are absolute; directories are expanded
        # to their contained files
        foreach my $df (@deleted_files) {
            my $full_df =
                (&util::filename_is_absolute($df))
                ? $df
                : &util::filename_cat($importdir,$df);

            if (-d $full_df) {
                &add_dir_contents_to_list($full_df, \@full_deleted_files);
            } else {
                push(@full_deleted_files,$full_df);
            }
        }

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
        mark_docs_for_deletion($archive_info,{},
                               \@full_deleted_files,
                               $archivedir, $verbosity, "delete");

        #
        # 2. Now files for reindexing
        #
        my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
        my @full_reindex_files = ();
        # ensure all filenames are absolute
        foreach my $rf (@reindex_files) {
            my $full_rf =
                (&util::filename_is_absolute($rf))
                ? $rf
                : &util::filename_cat($importdir,$rf);

            if (-d $full_rf) {
                &add_dir_contents_to_list($full_rf, \@full_reindex_files);
            } else {
                push(@full_reindex_files,$full_rf);
            }
        }

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
        mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");

        # And now to ensure the new version of the file is processed by the
        # appropriate plugin, we need to add it to the block_hash reindex list
        foreach my $full_rf (@full_reindex_files) {
            $block_hash->{'reindex_files'}->{$full_rf} = 1;
        }

        #
        # 3. Now finally any new files - add to block_hash new_files list
        #
        my @new_files = keys %{$manifest_lookup->{'index'}};
        my @full_new_files = ();

        foreach my $nf (@new_files) {
            # ensure filename is absolute
            my $full_nf =
                (&util::filename_is_absolute($nf))
                ? $nf
                : &util::filename_cat($importdir,$nf);

            if (-d $full_nf) {
                &add_dir_contents_to_list($full_nf, \@full_new_files);
            } else {
                push(@full_new_files,$full_nf);
            }
        }

        # NOTE(review): this lexical shadows the outer
        # $arcinfo_src_filename declared above (same value)
        my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
        my $arcinfodb_map = {};
        &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
        foreach my $f (@full_new_files) {
            # check that we haven't seen it already
            if (defined $arcinfodb_map->{$f}) {
                # TODO make better warning
                print STDERR "Warning: $f already in src archive, \n";
            } else {
                $block_hash->{'new_files'}->{$f} = 1;
            }
        }

        undef $arcinfodb_map;
    }
    else {
        # if incremental, we read through the import folder to see whats changed.

        if ($incremental || $incremental_mode eq "onlyadd") {
            # restore doc.pm's OID counter saved by a previous build
            prime_doc_oid_count($archivedir);

            # Can now work out which files were new, already existed, and have
            # been deleted
            new_vs_old_import_diff($archive_info,$block_hash,$importdir,
                                   $archivedir,$verbosity,$incremental_mode);

            my @new_files = sort keys %{$block_hash->{'new_files'}};
            # NOTE(review): scalar(@new_files>0) evaluates "@new_files>0"
            # (a boolean) inside scalar() — it behaves as intended here but
            # was presumably meant to be scalar(@new_files)>0
            if (scalar(@new_files>0)) {
                print STDERR "New files and modified metadata files since last import:\n ";
                print STDERR join("\n ",@new_files), "\n";
            }

            if ($incremental) {
                # only look for deletions if we are truly incremental
                my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
                # Filter out any in gsdl/tmp area
                my @filtered_deleted_files = ();
                my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
                my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
                # escape regex metacharacters (Windows \ and brackets)
                $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
                $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);

                foreach my $df (@deleted_files) {
                    next if ($df =~ m/^$gsdl_tmp_area/);
                    next if ($df =~ m/^$collect_tmp_area/);

                    push(@filtered_deleted_files,$df);
                }

                @deleted_files = @filtered_deleted_files;

                if (scalar(@deleted_files)>0) {
                    print STDERR "Files deleted since last import:\n ";
                    print STDERR join("\n ",@deleted_files), "\n";

                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);

                    mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
                }

                my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};

                if (scalar(@reindex_files)>0) {
                    print STDERR "Files to reindex since last import:\n ";
                    print STDERR join("\n ",@reindex_files), "\n";
                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
                    mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
                }

            }
        }
    }

    # Check for existence of the file that's to contain earliestDateStamp in archivesdir
    # Do nothing if the file already exists (file exists on incremental build).
    # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
    # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
    # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
    # oailastmodified and oailastmodifieddate
    my $earliestDatestampFile = &util::filename_cat($archivedir, "earliestDatestamp");
    if (!-f $earliestDatestampFile && -d $archivedir) {
        my $current_time_in_seconds = time; # in seconds

        if(open(FOUT, ">$earliestDatestampFile")) {
            # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
            print FOUT $current_time_in_seconds;
            close(FOUT);
        }
        else {
            &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
        }

    }

    # now, whichever mode we are in, we can process the entire import folder
    if ((defined $jobs) && ($jobs > 1))
    {
        # if jobs are set to >1, run in parallel using MPI helper
        # [hs, 1 july 2010]
        &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,
                                              $self->{'collection'}, $self->{'site'});
    }
    else
    {
        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }

    if ($saveas eq "FedoraMETS") {
        # create collection "doc obj" for Fedora that contains
        # collection-level metadata
        my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
        $doc_obj->set_OID("collection");

        my $col_name = undef;
        my $col_meta = $collectcfg->{'collectionmeta'};

        if (defined $col_meta) {
            store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
            store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
        }
        $processor->process($doc_obj);
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # Store the value of OIDCount (used in doc.pm) so it can be
    # restored correctly to this value on an incremental build
    store_doc_oid_count($archivedir);

    # write out the archive information file
    $processor->close_file_output() if (defined $groupsize) && ($groupsize > 1);
    $processor->close_group_output() if $processor->is_group();

    # for backwards compatibility with the archives.inf file
    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
        $archive_info->save_info($arcinfo_doc_filename);
    }
    else {
        $archive_info->save_revinfo_db($arcinfo_src_filename);
    }

    return $pluginfo;
}
788
789
# Write the end-of-run statistics (documents processed/failed etc.) and
# close the output and fail-log handles opened in new() and
# read_collection_cfg().
#
# Parameter: $pluginfo - plugin list returned by process_files().
#
# Statistics go to $self->{'statsfile'}; if that is a filename it is
# opened here, falling back to STDERR if the open fails.
sub generate_statistics
{
    my $self = shift @_;
    my ($pluginfo) = @_;

    my $inexport_mode = $self->{'mode'};

    my $statsfile = $self->{'statsfile'};
    my $out = $self->{'out'};
    my $faillogname = $self->{'faillogname'};
    my $gli = $self->{'gli'};
    my $jobs = $self->{'jobs'};

    # write out import stats

    if ((!defined $jobs) || ($jobs == 1))
    {
        # only output statistics when NOT running as multiple parallel
        # jobs (each MPI worker reports separately)
        # [hs, 1 july 2010]

        my $close_stats = 0;
        # if statsfile is a real filename (not STDERR/STDOUT), open it and
        # switch $statsfile to the package-scoped handle name
        if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
            if (open (STATS, ">$statsfile")) {
                $statsfile = 'inexport::STATS';
                $close_stats = 1;
            } else {
                &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
                &gsprintf($out, "{import.stats_backup}\n");
                $statsfile = 'STDERR';
            }
        }

        &gsprintf($out, "\n");
        &gsprintf($out, "*********************************************\n");
        &gsprintf($out, "{$inexport_mode.complete}\n");
        &gsprintf($out, "*********************************************\n");

        &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
        if ($close_stats) {
            close STATS;
        }
    }

    # OUT was opened by new() (only when redirecting to a file);
    # FAILLOG was opened by read_collection_cfg()
    close OUT if $self->{'close_out'};
    close FAILLOG;
}
836
837
# Copy one collection-level metadata field onto a document object.
#
# Parameters:
#   $collectionmeta - hash of field-name => { lang-key => value }
#   $field          - which field to store (e.g. "collectionname")
#   $doc_obj        - doc object receiving the metadata
#
# Each value is stored as "ex.$field", with a "^lang" suffix when the
# key carries a [l=lang] language qualifier.  The (English or
# unqualified) collection name is additionally stored as dc.Title —
# see collConfigxml.pm: GS2's "collectionextra" is called "description"
# in GS3, while "collectionname" in GS2 is called "name" in GS3.
sub store_collectionmeta
{
    my ($collectionmeta, $field, $doc_obj) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $values_by_key = $collectionmeta->{$field};

    foreach my $meta_key (keys %$values_by_key)
    {
        my $value = $values_by_key->{$meta_key};

        # base label; append "^lang" when the key is language-qualified
        my $label = "ex.$field";
        if ($meta_key =~ m/^\[l=(.*?)\]$/)
        {
            $label .= "^$1";
        }

        $doc_obj->add_utf8_metadata($top_section, $label, $value);

        # mirror the collection name into dc.Title as well
        if ($label eq "ex.collectionname^en" || $label eq "ex.collectionname")
        {
            $doc_obj->add_utf8_metadata($top_section, "dc.Title", $value);
        }
    }
}
875
876
# Return the full path of the "OIDcount" bookkeeping file kept inside
# the given archives directory.
sub oid_count_file {
    my ($archivedir) = @_;

    my $oid_count_path = &util::filename_cat($archivedir, "OIDcount");
    return $oid_count_path;
}
881
882
# Restore doc.pm's OID counter ($doc::OIDcount) from the "OIDcount" file
# in the archives directory, if that file exists.  Used on incremental
# builds so newly assigned OIDs continue from the previous run.
#
# Parameter: $archivedir - the collection's archives directory.
# Silently does nothing when the file is absent; reports (via gsprintf)
# when the file exists but cannot be read.
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    if (-e $oid_count_filename) {
	# Three-argument open with a lexical handle: avoids clobbering a
	# package-scoped bareword handle and prevents mode characters in
	# the filename being interpreted as part of the open mode.
	if (open(my $oid_in, '<', $oid_count_filename)) {
	    my $OIDcount = <$oid_in>;
	    close($oid_in);

	    # guard against an empty file: chomp/assigning undef would
	    # raise warnings and clobber the counter with undef
	    if (defined $OIDcount) {
		chomp $OIDcount;
		$doc::OIDcount = $OIDcount;
	    }
	    else {
		&gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
	    }
	}
	else {
	    &gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
	}
    }

}
902
# Use the file "OIDcount" in the archives directory to record what value
# doc.pm's OID counter ($doc::OIDcount) got up to, so an incremental
# build can resume from it (see prime_doc_oid_count).
#
# Parameter: $archivedir - the collection's archives directory.
# Reports (via gsprintf) when the file cannot be written.
sub store_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # Three-argument open with a lexical handle: safer than the bareword
    # 2-arg form (no global handle clobbering, no mode injection via the
    # filename).  Buffered write errors surface at close, so check it too.
    if (open(my $oid_out, '>', $oid_count_filename)) {
	print {$oid_out} $doc::OIDcount, "\n";

	close($oid_out)
	    or &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
    else {
	&gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
921
922
923
924sub new_vs_old_import_diff
925{
926 my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;
927
928 # Get the infodbtype value for this collection from the arcinfo object
929 my $infodbtype = $archive_info->{'infodbtype'};
930
931 # in this method, we want to know if metadata files are modified or not.
932 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
933
934 my $archiveinf_timestamp = -M $arcinfo_doc_filename;
935
936 # First convert all files to absolute form
937 # This is to support the situation where the import folder is not
938 # the default
939
940 my $prev_all_files = $archive_info->{'prev_import_filelist'};
941 my $full_prev_all_files = {};
942
943 foreach my $prev_file (keys %$prev_all_files) {
944
945 if (!&util::filename_is_absolute($prev_file)) {
946 my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
947 $full_prev_all_files->{$full_prev_file} = $prev_file;
948 }
949 else {
950 $full_prev_all_files->{$prev_file} = $prev_file;
951 }
952 }
953
954
955 # Figure out which are the new files, existing files and so
956 # by implication the files from the previous import that are not
957 # there any more => mark them for deletion
958 foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
959
960 my $full_curr_file = $curr_file;
961
962 # entry in 'all_files' is moved to either 'existing_files',
963 # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'
964
965 if (!&util::filename_is_absolute($curr_file)) {
966 # add in import dir to make absolute
967 $full_curr_file = &util::filename_cat($importdir,$curr_file);
968 }
969
970 # figure out if new file or not
971 if (defined $full_prev_all_files->{$full_curr_file}) {
972 # delete it so that only files that need deleting are left
973 delete $full_prev_all_files->{$full_curr_file};
974
975 # had it before. is it a metadata file?
976 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
977
978 # is it modified??
979 if (-M $full_curr_file < $archiveinf_timestamp) {
980 print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
981 # its newer than last build
982 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
983 }
984 }
985 else {
986 if ($incremental_mode eq "all") {
987
988 # had it before
989 $block_hash->{'existing_files'}->{$full_curr_file} = 1;
990
991 }
992 else {
993 # Warning in "onlyadd" mode, but had it before!
994 print STDERR "Warning: File $full_curr_file previously imported.\n";
995 print STDERR " Treating as new file\n";
996
997 $block_hash->{'new_files'}->{$full_curr_file} = 1;
998
999 }
1000 }
1001 }
1002 else {
1003 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
1004 # the new file is the special sort of file greenstone uses
1005 # to attach metadata to src documents
1006 # i.e metadata.xml
1007 # (but note, the filename used is not constrained in
1008 # Greenstone to always be this)
1009
1010 print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
1011 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
1012 }
1013 else {
1014 $block_hash->{'new_files'}->{$full_curr_file} = 1;
1015 }
1016 }
1017
1018
1019 delete $block_hash->{'all_files'}->{$curr_file};
1020 }
1021
1022
1023
1024
1025 # Deal with complication of new or modified metadata files by forcing
1026 # everything from this point down in the file hierarchy to
1027 # be freshly imported.
1028 #
1029 # This may mean files that have not changed are reindexed, but does
1030 # guarantee by the end of processing all new metadata is correctly
1031 # associated with the relevant document(s).
1032
1033 foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
1034 my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");
1035
1036 $situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
1037 $situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression
1038
1039 # Go through existing_files, and mark anything that is contained
1040 # within 'situated_dir' to be reindexed (in case some of the metadata
1041 # attaches to one of these files)
1042
1043 my $reindex_files = [];
1044
1045 foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {
1046
1047 if ($existing_f =~ m/^$situated_dir/) {
1048
1049 print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";
1050
1051 push(@$reindex_files,$existing_f);
1052 $block_hash->{'reindex_files'}->{$existing_f} = 1;
1053 delete $block_hash->{'existing_files'}->{$existing_f};
1054
1055 }
1056 }
1057
1058 # metadata file needs to be in new_files list so parsed by MetadataXMLPlug
1059 # (or equivalent)
1060 $block_hash->{'new_files'}->{$new_mdf} = 1;
1061
1062 }
1063
1064 # go through remaining existing files and work out what has changed and needs to be reindexed.
1065 my @existing_files = sort keys %{$block_hash->{'existing_files'}};
1066
1067 my $reindex_files = [];
1068
1069 foreach my $existing_filename (@existing_files) {
1070 if (-M $existing_filename < $archiveinf_timestamp) {
1071 # file is newer than last build
1072
1073 my $existing_file = $existing_filename;
1074 #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});
1075
1076 #my $collectdir_resafe = &util::filename_to_regex($collectdir);
1077 #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;
1078
1079 print STDERR "**** Reindexing existing file: $existing_file\n";
1080
1081 push(@$reindex_files,$existing_file);
1082 $block_hash->{'reindex_files'}->{$existing_filename} = 1;
1083 }
1084
1085 }
1086
1087
1088 # By this point full_prev_all_files contains the files
1089 # mentioned in archiveinf-src.db but are not in the 'import'
1090 # folder (or whatever was specified through -importdir ...)
1091
1092 # This list can contain files that were created in the 'tmp' or
1093 # 'cache' areas (such as screen-size and thumbnail images).
1094 #
1095 # In building the final list of files to delete, we test to see if
1096 # it exists on the filesystem and if it does (unusual for a "normal"
1097 # file in import, but possible in the case of 'tmp' files),
1098 # supress it from going into the final list
1099
1100 my $collectdir = $ENV{'GSDLCOLLECTDIR'};
1101
1102 my @deleted_files = values %$full_prev_all_files;
1103 map { my $curr_file = $_;
1104 my $full_curr_file = $curr_file;
1105
1106 if (!&util::filename_is_absolute($curr_file)) {
1107 # add in import dir to make absolute
1108
1109 $full_curr_file = &util::filename_cat($collectdir,$curr_file);
1110 }
1111
1112
1113 if (!-e $full_curr_file) {
1114 $block_hash->{'deleted_files'}->{$curr_file} = 1;
1115 }
1116 } @deleted_files;
1117
1118
1119
1120}
1121
1122
# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
# $mode is 'delete' or 'reindex'
#
# Parameters:
#   $archive_info   - arcinfo object for this collection; supplies 'infodbtype'
#                     and per-OID status info (get_status_info/set_status_info)
#   $block_hash     - shared hashref of file lists; this sub adds entries to
#                     $block_hash->{'reindex_files'} (and may remove them again
#                     in 'delete' mode, see the final loop)
#   $deleted_files  - arrayref of source-file paths to process
#   $archivedir     - archives directory, used to locate the infodb files
#   $verbosity      - >1 enables per-OID progress messages on STDERR
#   $mode           - 'delete' or 'reindex'; only affects the message text and
#                     the final reindex-list cleanup
#
# Side effects: rewrites entries in the archiveinf-src and archiveinf-doc
# info databases, and flips each affected OID's index-status to "D" so the
# next buildcol.pl pass deletes (or reimports) it.
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    # Human-readable action for the progress message below.
    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
	$mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
	# use 'archiveinf-src' info database file to look up all the OIDs
	# that this file is used in (note in most cases, it's just one OID)

	my $src_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
	my $oids = $src_rec->{'oid'};
	# NOTE(review): $file_record_deleted is never used below — looks like
	# leftover bookkeeping from an earlier version of this sub.
	my $file_record_deleted = 0;

	# delete the src record
	# (done before touching the doc records; handle is opened in "append"
	# mode, which presumably means "modify existing db" for this backend)
	my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
	&dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
	&dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);


	foreach my $oid (@$oids) {

	    # find the source doc (the primary file that becomes this oid)
	    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
	    my $doc_source_file = $doc_rec->{'src-file'}->[0];
	    if (!&util::filename_is_absolute($doc_source_file)) {
		$doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
	    }

	    if ($doc_source_file ne $file) {
		# its an associated or metadata file

		# mark source doc for reimport as one of its assoc files has changed or deleted
		$block_hash->{'reindex_files'}->{$doc_source_file} = 1;

	    }
	    # Only flip the status if it isn't already "D" — avoids rewriting
	    # the doc record for OIDs processed via an earlier $file.
	    my $curr_status = $archive_info->get_status_info($oid);
	    if (defined($curr_status) && (($curr_status ne "D"))) {
		if ($verbosity>1) {
		    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
		}
		# mark oid for deletion (it will be deleted or reimported)
		$archive_info->set_status_info($oid,"D");
		# Rewrite the raw db record with its <index-status> forced to D,
		# then round-trip it through the hash form for write_infodb_entry.
		my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
		$val =~ s/^<index-status>(.*)$/<index-status>D/m;

		my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
		my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

		&dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
		&dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
	    }
	}

    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.

    if ($mode eq "delete") {
	foreach my $file (@$deleted_files) {
	    if (defined $block_hash->{'reindex_files'}->{$file}) {
		delete $block_hash->{'reindex_files'}->{$file};
	    }
	}
    }


}
1207
# Recursively collect every plain file under $dirname, appending each full
# path (built with &util::filename_cat) to the array referenced by $list.
#
# Parameters:
#   $dirname - directory to scan
#   $list    - arrayref that accumulates the file paths found
#
# Returns -1 (after printing a warning to STDERR) if $dirname cannot be
# read, 0 otherwise.  Skips '.', '..' and Subversion admin dirs ('.svn').
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # find all the entries in the directory; use a lexical directory handle
    # rather than the old bareword DIR, which was a single shared global in
    # this recursive sub
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
	print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    my @dir_entries = readdir ($dir_handle);
    closedir ($dir_handle);

    foreach my $subfile (@dir_entries) {
	next if ($subfile =~ m/^\.\.?$/);  # skip '.' and '..'
	next if ($subfile =~ /^\.svn$/);   # skip Subversion metadata dirs

	my $full_file = &util::filename_cat($dirname, $subfile);
	if (-d $full_file) {
	    # recurse; any read failure is reported (and -1 returned) there
	    &add_dir_contents_to_list($full_file, $list);
	} else {
	    push (@$list, $full_file);
	}
    }

    return 0;
}
1236
1237
12381;
# Note: See TracBrowser for help on using the repository browser.