source: main/trunk/greenstone2/perllib/inexport.pm @ 22421

Last change on this file since 22421 was 22421, checked in by davidb, 14 years ago

Continued work on refactoring code to have better shared support for import.pl and export.pl

  • Property svn:executable set to *
File size: 32.9 KB
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use plugin;
37use plugout;
38use manifest;
39use inexport;
40use dbutil;
41use util;
42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
47
48use File::Basename;
49
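# ----------------------------------------------------------------------------
# A minimal usage sketch (comments only, not executed here): this is roughly
# how a driver script such as import.pl or export.pl is expected to use this
# class, assuming the caller has already assembled an $options table whose
# 'args' entry is a parse2-style argument list.
#
#   my $inexport = new inexport("import", \@ARGV, $options);
#   my $collection = $inexport->get_collection();
#   my ($config_filename, $collectcfg)
#       = $inexport->read_collection_cfg($collection, $options);
#   $inexport->set_collection_options($collectcfg);
#   my $pluginfo = $inexport->process_files($config_filename, $collectcfg);
#   $inexport->generate_statistics($pluginfo);
# ----------------------------------------------------------------------------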
50sub new
51{
52 my $class = shift (@_);
53 my ($mode,$argv,$options,$opt_listall_options) = @_;
54
55 my $self = { 'xml' => 0, 'mode' => $mode };
56
57 # general options available to all plugins
58 my $arguments = $options->{'args'};
59 my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
60 # Parse returns -1 if something has gone wrong
61 if ($intArgLeftinAfterParsing == -1)
62 {
63 &PrintUsage::print_txt_usage($options, "{import.params}");
64 die "\n";
65 }
66
67 my $language = $self->{'language'};
68 # If $language has been specified, load the appropriate resource bundle
69 # (Otherwise, the default resource bundle will be loaded automatically)
70 if ($language && $language =~ /\S/) {
71 &gsprintf::load_language_specific_resource_bundle($language);
72 }
73
74 if ($self->{'listall'}) {
75 if ($self->{'xml'}) {
76 &PrintUsage::print_xml_usage($opt_listall_options);
77 }
78 else
79 {
80 &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
81 }
82 die "\n";
83 }
84
85
86 if ($self->{'xml'}) {
87 &PrintUsage::print_xml_usage($options);
88 print "\n";
89 return;
90 }
91
92 if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
93 &gsprintf::output_strings_in_UTF8;
94 }
95
96 # now check that we had exactly one leftover arg, which should be
97 # the collection name. We don't want to do this earlier, because the
98 # -xml arg doesn't need a collection name.
99 # Also print the usage if the user specified -h.
100 if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
101 {
102 &PrintUsage::print_txt_usage($options, "{import.params}");
103 die "\n";
104 }
105
106 $self->{'close_out'} = 0;
107 my $out = $self->{'out'};
108 if ($out !~ /^(STDERR|STDOUT)$/i) {
109 open (OUT, ">$out") ||
110 (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
111 $out = 'inexport::OUT';
112 $self->{'close_out'} = 1;
113 }
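# (when -out named a file, $out now holds the package-qualified name of the
# filehandle opened above; using a string this way as an indirect filehandle
# relies on the "no strict 'refs'" declared at the top of this file)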
114 $out->autoflush(1);
115 $self->{'out'} = $out;
116
117 # the one remaining argument in $argv should be the collection name
118 $self->{'collection'} = shift @$argv;
119
120 return bless $self, $class;
121}
122
123sub get_collection
124{
125 my $self = shift @_;
126
127 return $self->{'collection'};
128}
129
130
131sub read_collection_cfg
132{
133 my $self = shift @_;
134 my ($collection,$options) = @_;
135
136 my $collectdir = $self->{'collectdir'};
137 my $site = $self->{'site'};
138 my $out = $self->{'out'};
139
140 if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
141 &PrintUsage::print_txt_usage($options, "{import.params}");
142 die "\n";
143 }
144
145 # add collection's perllib dir into include path in
146 # case we have collection specific modules
147 unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
148
149 # check that we can open the faillog
150 my $faillog = $self->{'faillog'};
151 if ($faillog eq "") {
152 $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
153 }
154 open (FAILLOG, ">$faillog") ||
155 (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);
156
157
158 my $faillogname = $faillog;
159 $faillog = 'inexport::FAILLOG';
160 $faillog->autoflush(1);
161 $self->{'faillog'} = $faillog;
162 $self->{'faillogname'} = $faillogname;
163
164 # Read in the collection configuration file.
165 my ($config_filename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
166 my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);
167
168 return ($config_filename,$collectcfg);
169}
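# Note: only the configuration file's path and the parsed configuration hash
# are returned; the $gs_mode value obtained from colcfg is used locally to
# decide how that file should be read and is not needed by the caller.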
170
171sub set_collection_options
172{
173 my $self = shift @_;
174 my ($collectcfg) = @_;
175
176 my $inexport_mode = $self->{'mode'};
177
178 my $verbosity = $self->{'verbosity'};
179 my $debug = $self->{'debug'};
180 my $importdir = $self->{'importdir'};
181 my $archivedir = $self->{'archivedir'};
182 my $out = $self->{'out'};
183
184 # If the infodbtype value wasn't defined in the collect.cfg file, use the default
185 if (!defined($collectcfg->{'infodbtype'}))
186 {
187 $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
188 }
189
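# For most of the settings handled below the precedence is: a value supplied
# on the command line wins, otherwise the collect.cfg value is used,
# otherwise a hard-wired default applies.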
190 if (defined $collectcfg->{'importdir'} && $importdir eq "") {
191 $importdir = $collectcfg->{'importdir'};
192 }
193 if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
194 $archivedir = $collectcfg->{'archivedir'};
195 }
196 # fill in the default import and archives directories if none
197 # were supplied, turn all \ into / and remove trailing /
198 $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
199 $importdir =~ s/[\\\/]+/\//g;
200 $importdir =~ s/\/$//;
201 if (!-e $importdir) {
202 &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
203 die "\n";
204 }
205 $self->{'importdir'} = $importdir;
206
207 if ($archivedir eq "") {
208 if ($inexport_mode eq "import") {
209 $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
210 }
211 elsif ($inexport_mode eq "export") {
212 $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
213 }
214 else {
215 print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
216 print STDERR " Defaulting to 'archives' for file output\n";
217 $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
218 }
219 }
220
221 $archivedir =~ s/[\\\/]+/\//g;
222 $archivedir =~ s/\/$//;
223 $self->{'archivedir'} = $archivedir;
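# e.g. a configured value of "C:\gsdl\collect\demo\archives\" ends up stored
# as "C:/gsdl/collect/demo/archives" after the two substitutions above (the
# same normalisation was applied to $importdir earlier)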
224
225 if ($verbosity !~ /\d+/) {
226 if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
227 $verbosity = $collectcfg->{'verbosity'};
228 } else {
229 $verbosity = 2; # the default
230 }
231 }
232 $self->{'verbosity'} = $verbosity;
233
234 if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
235 $self->{'manifest'} = $collectcfg->{'manifest'};
236 }
237
238 if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
239 if ($collectcfg->{'gzip'} =~ /^true$/i) {
240 $self->{'gzip'} = 1;
241 }
242 }
243
244 if ($self->{'maxdocs'} !~ /\-?\d+/) {
245 if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
246 $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
247 } else {
248 $self->{'maxdocs'} = -1; # the default
249 }
250 }
251
252 if ((defined $self->{'groupsize'}) && ($self->{'groupsize'} == 1)) {
253 if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
254 $self->{'groupsize'} = $collectcfg->{'groupsize'};
255 }
256 }
257
258 if (!defined $self->{'OIDtype'}
259 || ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) {
260 if (defined $collectcfg->{'OIDtype'}
261 && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
262 $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
263 } else {
264 $self->{'OIDtype'} = "hash"; # the default
265 }
266 }
267
268 if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
269 if (defined $collectcfg->{'OIDmetadata'}) {
270 $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
271 } else {
272 $self->{'OIDmetadata'} = "dc.Identifier"; # the default
273 }
274 }
275
276 my $sortmeta = $self->{'sortmeta'};
277 if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
278 $sortmeta = $collectcfg->{'sortmeta'};
279 }
280 # sortmeta cannot be used with group size
281 $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
282 if (defined $sortmeta && $self->{'groupsize'} > 1) {
283 &gsprintf($out, "{import.cannot_sort}\n\n");
284 $sortmeta = undef;
285 }
286 $self->{'sortmeta'} = $sortmeta;
287
288 if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") {
289 $self->{'removeprefix'} = $collectcfg->{'removeprefix'};
290 }
291
292 if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") {
293 $self->{'removesuffix'} = $collectcfg->{'removesuffix'};
294 }
295 if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
296 $self->{'debug'} = 1;
297 }
298 if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
299 $self->{'gli'} = 1;
300 }
301 $self->{'gli'} = 0 unless defined $self->{'gli'};
302
303 # check keepold and removeold
304 my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";
305
306 my ($removeold, $keepold, $incremental, $incremental_mode)
307 = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
308 $self->{'incremental'}, $checkdir,
309 $collectcfg);
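# check_removeold_and_keepold reconciles the -removeold/-keepold/-incremental
# settings with the collection configuration; the $incremental_mode it
# returns is what process_files below tests against "onlyadd" and "all"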
310
311 $self->{'removeold'} = $removeold;
312 $self->{'keepold'} = $keepold;
313 $self->{'incremental'} = $incremental;
314 $self->{'incremental_mode'} = $incremental_mode;
315}
316
317sub process_files
318{
319 my $self = shift @_;
320 my ($config_filename,$collectcfg) = @_;
321
322 my $inexport_mode = $self->{'mode'};
323
324 my $verbosity = $self->{'verbosity'};
325 my $debug = $self->{'debug'};
326
327 my $importdir = $self->{'importdir'};
328 my $archivedir = $self->{'archivedir'};
329
330 my $incremental = $self->{'incremental'};
331 my $incremental_mode = $self->{'incremental_mode'};
332
333 my $removeold = $self->{'removeold'};
334 my $keepold = $self->{'keepold'};
335
336 my $saveas = $self->{'saveas'};
337 my $OIDtype = $self->{'OIDtype'};
338 my $OIDmetadata = $self->{'OIDmetadata'};
339
340 my $out = $self->{'out'};
341 my $faillog = $self->{'faillog'};
342
343 my $maxdocs = $self->{'maxdocs'};
344 my $gzip = $self->{'gzip'};
345 my $groupsize = $self->{'groupsize'};
346 my $sortmeta = $self->{'sortmeta'};
347
348 my $removeprefix = $self->{'removeprefix'};
349 my $removesuffix = $self->{'removesuffix'};
350
351 my $gli = $self->{'gli'};
352
353 # related to export
354 my $xsltfile = $self->{'xsltfile'};
355 my $group_marc = $self->{'group_marc'};
356 my $mapping_file = $self->{'mapping_file'};
357 my $xslt_mets = $self->{'xslt_mets'};
358 my $xslt_txt = $self->{'xslt_txt'};
359 my $fedora_namespace = $self->{'fedora_namespace'};
360
361 if ($inexport_mode eq "import") {
362 print STDERR "<Import>\n" if $gli;
363 }
364 else {
365 print STDERR "<export>\n" if $gli;
366 }
367
368 my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
369 if ($self->{'manifest'} ne "") {
370 my $manifest_filename = $self->{'manifest'};
371
372 if (!&util::filename_is_absolute($manifest_filename)) {
373 $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
374 }
375
376 $self->{'manifest'} =~ s/[\\\/]+/\//g;
377 $self->{'manifest'} =~ s/\/$//;
378
379 $manifest_lookup->parse($manifest_filename);
380 }
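# When a manifest has been supplied, the usual scan of the import directory
# is bypassed further down: only the files listed in the manifest's
# index/reindex/delete sections are processed (see the else branch of the
# "$manifest eq ..." test below)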
381
382 my $manifest = $self->{'manifest'};
383
384 # load all the plugins
385 my $plugins = [];
386 if (defined $collectcfg->{'plugin'}) {
387 $plugins = $collectcfg->{'plugin'};
388 }
389
390 #some global options for the plugins
391 my @global_opts = ();
392
393 my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental_mode);
394 if (scalar(@$pluginfo) == 0) {
395 &gsprintf($out, "{import.no_plugins_loaded}\n");
396 die "\n";
397 }
398
399 # remove the old contents of the archives directory (and tmp directory) if needed
400 if ($removeold) {
401 if (-e $archivedir) {
402 &gsprintf($out, "{import.removing_archives}\n");
403 &util::rm_r ($archivedir);
404 }
405 my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
406 $tmpdir =~ s/[\\\/]+/\//g;
407 $tmpdir =~ s/\/$//;
408 if (-e $tmpdir) {
409 &gsprintf($out, "{import.removing_tmpdir}\n");
410 &util::rm_r ($tmpdir);
411 }
412 }
413
414 # create the archives dir if needed
415 &util::mk_all_dir($archivedir);
416
417 # read the archive information file
418## my $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archives.inf");
419
420 # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
421 &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
422 &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));
423
424 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
425 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
426
427 my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
428 $archive_info->load_info ($arcinfo_doc_filename);
429
430 if ($manifest eq "") {
431 # Load in list of files in import folder from last import (if present)
432 $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
433 }
434
435 ####Use Plugout####
436 my $plugout;
437
438 if ($inexport_mode eq "import") {
439 if (defined $collectcfg->{'plugout'}) {
440 # If a plugout was specified in the collect.cfg file, assume it is sensible
441 # We can't check the name because it could be anything, if it is a custom plugout
442 $plugout = $collectcfg->{'plugout'};
443 }
444 else{
445 if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
446 push @$plugout,"GreenstoneXMLPlugout";
447 }
448 else{
449 push @$plugout,$saveas."Plugout";
450 }
451 }
452 }
453 else {
454 if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
455 $plugout = $collectcfg->{'plugout'};
456 }
457 else{
458 if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
459 push @$plugout,"GreenstoneMETSPlugout";
460 }
461 else{
462 push @$plugout,$saveas."Plugout";
463 }
464 }
465 }
466
467 my $plugout_name = $plugout->[0];
468
469 push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
470 push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
471 push @$plugout,("-debug") if ($debug);
472 push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
473 push @$plugout,("-gzip_output") if ($gzip);
474 push @$plugout,("-output_handle",$out) if (defined $out);
475
476 push @$plugout,("-xslt_file",$xsltfile) if (defined $xsltfile && $xsltfile ne "");
477
478 if ($plugout_name =~ m/^MARCXMLPlugout$/) {
479 push @$plugout,("-group") if ($group_marc);
480 push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "");
481 }
482 if ($plugout_name =~ m/^.*METSPlugout$/) {
483 push @$plugout,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $xslt_mets ne "");
484 push @$plugout,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $xslt_txt ne "");
485 }
486
487 if ($plugout_name eq "FedoraMETSPlugout") {
488 push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "");
489 }
490
491
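# At this point $plugout holds the plugout name followed by its options,
# roughly (the exact contents depend on the settings above) something like:
#   [ "GreenstoneXMLPlugout", "-output_info", $archive_info,
#     "-verbosity", 2, "-output_handle", $out ]
# plugout::load_plugout instantiates the named plugout with those arguments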
492 my $processor = &plugout::load_plugout($plugout);
493 $processor->setoutputdir ($archivedir);
494 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
495 $processor->set_OIDtype ($OIDtype, $OIDmetadata);
496
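# The flow from here is: plugin::begin, an optional blocking/diff pass over
# the source files, plugin::read on whatever needs (re)importing, then
# plugin::end and plugin::deinit; each document object the plugins produce is
# handed to $processor (the plugout), which writes it into $archivedir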
497 &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
498
499 if ($removeold) {
500 # occasionally, plugins may want to do something on remove old, eg pharos image indexing
501 &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
502 }
503 if ($manifest eq "") {
504 # process the import directory
505 my $block_hash = {};
506 my $metadata = {};
507 # global blocking pass may set up some metadata
508 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
509
510 if ($incremental || $incremental_mode eq "onlyadd") {
511
512 prime_doc_oid_count($archivedir);
513
514 # Can now work out which files were new, already existed, and have
515 # been deleted
516
517 new_vs_old_import_diff($archive_info,$block_hash,$importdir,
518 $archivedir,$verbosity,$incremental_mode);
519
520 my @new_files = sort keys %{$block_hash->{'new_files'}};
521 if (scalar(@new_files)>0) {
522 print STDERR "New files and modified metadata files since last import:\n ";
523 print STDERR join("\n ",@new_files), "\n";
524 }
525
526 if ($incremental) {
527 # only look for deletions if we are truly incremental
528 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
529 # Filter out any in gsdl/tmp area
530 my @filtered_deleted_files = ();
531 my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
532 my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
533 $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
534 $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
535
536 foreach my $df (@deleted_files) {
537 next if ($df =~ m/^$gsdl_tmp_area/);
538 next if ($df =~ m/^$collect_tmp_area/);
539
540 push(@filtered_deleted_files,$df);
541 }
542
543
544 @deleted_files = @filtered_deleted_files;
545
546 if (scalar(@deleted_files)>0) {
547 print STDERR "Files deleted since last import:\n ";
548 print STDERR join("\n ",@deleted_files), "\n";
549
550
551 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
552
553 mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
554 }
555
556 my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
557
558 if (scalar(@reindex_files)>0) {
559 print STDERR "Files to reindex since last import:\n ";
560 print STDERR join("\n ",@reindex_files), "\n";
561 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
562 mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
563 }
564
565 }
566
567 # Play it safe, and run through the entire folder, only processing new or edited files
568 &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
569
570 }
571 else {
572 &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
573 }
574
575 }
576 else
577 {
578 #
579 # 1. Process delete files first
580 #
581
582 my @deleted_files = keys %{$manifest_lookup->{'delete'}};
583 my @full_deleted_files = ();
584
585 # ensure all filenames are absolute
586 foreach my $df (@deleted_files) {
587 my $full_df =
588 (&util::filename_is_absolute($df))
589 ? $df
590 : &util::filename_cat($importdir,$df);
591
592 push(@full_deleted_files,$full_df);
593 }
594
595 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
596 mark_docs_for_deletion($archive_info,{},
597 \@full_deleted_files,
598 $archivedir, $verbosity, "delete");
599
600
601 #
602 # 2. Now files for reindexing
603 #
604
605 my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
606 my @full_reindex_files = ();
607
608 # ensure all filenames are absolute
609 foreach my $rf (@reindex_files) {
610 my $full_rf =
611 (&util::filename_is_absolute($rf))
612 ? $rf
613 : &util::filename_cat($importdir,$rf);
614
615 push(@full_reindex_files,$full_rf);
616 }
617
618 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
619 mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
620
621 # And now ensure the new version of the file is processed by the
622 # appropriate plugin
623 foreach my $full_rf (@full_reindex_files) {
624 &plugin::read ($pluginfo, "", $full_rf, {}, {}, $processor, $maxdocs, 0, $gli);
625 }
626
627
628 #
629 # 3. Now finally any new files
630 #
631
632 foreach my $file (keys %{$manifest_lookup->{'index'}}) {
633 &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli);
634 }
635
636
637 }
638
639 if ($saveas eq "FedoraMETS") {
640 # create collection "doc obj" for Fedora that contains
641 # collection-level metadata
642
643 my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
644 $doc_obj->set_OID("collection");
645
646 my $col_name = undef;
647 my $col_meta = $collectcfg->{'collectionmeta'};
648
649 if (defined $col_meta) {
650 store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
651 store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
652 }
653 $processor->process($doc_obj);
654 }
655
656 &plugin::end($pluginfo, $processor);
657
658 &plugin::deinit($pluginfo, $processor);
659
660 # Store the value of OIDCount (used in doc.pm) so it can be
661 # restored correctly to this value on an incremental build
662 store_doc_oid_count($archivedir);
663
664 # write out the archive information file
665 $processor->close_file_output() if $groupsize > 1;
666 $processor->close_group_output() if $processor->is_group();
667
668 # for backwards compatibility with the archives.inf file
669 if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
670 $archive_info->save_info($arcinfo_doc_filename);
671 }
672 else {
673 $archive_info->save_revinfo_db($arcinfo_src_filename);
674 }
675
676 return $pluginfo;
677}
678
679
680sub generate_statistics
681{
682 my $self = shift @_;
683 my ($pluginfo) = @_;
684
685 my $inexport_mode = $self->{'mode'};
686
687 my $statsfile = $self->{'statsfile'};
688 my $out = $self->{'out'};
689 my $faillogname = $self->{'faillogname'};
690 my $gli = $self->{'gli'};
691
692 # write out import stats
693 my $close_stats = 0;
694 if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
695 if (open (STATS, ">$statsfile")) {
696 $statsfile = 'inexport::STATS';
697 $close_stats = 1;
698 } else {
699 &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
700 &gsprintf($out, "{import.stats_backup}\n");
701 $statsfile = 'STDERR';
702 }
703 }
704
705 &gsprintf($out, "\n");
706 &gsprintf($out, "*********************************************\n");
707 &gsprintf($out, "{$inexport_mode.complete}\n");
708 &gsprintf($out, "*********************************************\n");
709
710 &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
711 if ($close_stats) {
712 close STATS;
713 }
714
715 close OUT if $self->{'close_out'};
716 close FAILLOG;
717}
718
719
720
721
722
723
724
725sub oid_count_file {
726 my ($archivedir) = @_;
727 return &util::filename_cat ($archivedir, "OIDcount");
728}
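# i.e. the "OIDcount" file inside the collection's archives (or export)
# directory, alongside the archive information databases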
729
730
731sub prime_doc_oid_count
732{
733 my ($archivedir) = @_;
734 my $oid_count_filename = &oid_count_file($archivedir);
735
736 if (-e $oid_count_filename) {
737 if (open(OIDIN,"<$oid_count_filename")) {
738 my $OIDcount = <OIDIN>;
739 chomp $OIDcount;
740 close(OIDIN);
741
742 $doc::OIDcount = $OIDcount;
743 }
744 else {
745
746 print STDERR "Warning: unable to read document OID count from $oid_count_filename\n";
747 print STDERR "Setting value to 0\n";
748 }
749 }
750
751}
752
753sub store_doc_oid_count
754{
755 # Use the file "OIDcount" in the archives directory to record
756 # what value doc.pm got up to
757
758 my ($archivedir) = @_;
759 my $oid_count_filename = &oid_count_file($archivedir);
760
761
762 if (open(OIDOUT,">$oid_count_filename")) {
763 print OIDOUT $doc::OIDcount, "\n";
764
765 close(OIDOUT);
766 }
767 else {
768 print STDERR "Warning: unable to store document OID count\n";
769 }
770}
771
772
773
774sub new_vs_old_import_diff
775{
776 my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;
777
778 # Get the infodbtype value for this collection from the arcinfo object
779 my $infodbtype = $archive_info->{'infodbtype'};
780
781 # in this method, we want to know if metadata files are modified or not.
782 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
783
784 my $archiveinf_timestamp = -M $arcinfo_doc_filename;
785
786 # First convert all files to absolute form
787 # This is to support the situation where the import folder is not
788 # the default
789
790 my $prev_all_files = $archive_info->{'prev_import_filelist'};
791 my $full_prev_all_files = {};
792
793 foreach my $prev_file (keys %$prev_all_files) {
794
795 if (!&util::filename_is_absolute($prev_file)) {
796 my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
797 $full_prev_all_files->{$full_prev_file} = $prev_file;
798 }
799 else {
800 $full_prev_all_files->{$prev_file} = $prev_file;
801 }
802 }
803
804
805 # Figure out which files are new and which already existed, and
806 # so, by implication, which files from the previous import are no
807 # longer there => mark those for deletion
808 foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
809
810 my $full_curr_file = $curr_file;
811
812 # entry in 'all_files' is moved to either 'existing_files',
813 # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'
814
815 if (!&util::filename_is_absolute($curr_file)) {
816 # add in import dir to make absolute
817 $full_curr_file = &util::filename_cat($importdir,$curr_file);
818 }
819
820 # figure out if new file or not
821 if (defined $full_prev_all_files->{$full_curr_file}) {
822 # delete it so that only files that need deleting are left
823 delete $full_prev_all_files->{$full_curr_file};
824
825 # had it before. is it a metadata file?
826 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
827
828 # is it modified??
829 if (-M $full_curr_file < $archiveinf_timestamp) {
830 print STDERR "*** Detected a modified metadata file: $full_curr_file\n" if $verbosity > 2;
831 # it's newer than the last build
832 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
833 }
834 }
835 else {
836 if ($incremental_mode eq "all") {
837
838 # had it before
839 $block_hash->{'existing_files'}->{$full_curr_file} = 1;
840
841 }
842 else {
843 # Warning in "onlyadd" mode, but had it before!
844 print STDERR "Warning: File $full_curr_file previously imported.\n";
845 print STDERR " Treating as new file\n";
846
847 $block_hash->{'new_files'}->{$full_curr_file} = 1;
848
849 }
850 }
851 }
852 else {
853 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
854 # the new file is the special sort of file Greenstone uses
855 # to attach metadata to src documents,
856 # i.e. metadata.xml
857 # (but note, the filename used is not constrained in
858 # Greenstone to always be this)
859
860 print STDERR "***** Detected new metadata file: $full_curr_file\n" if $verbosity > 2;
861 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
862 }
863 else {
864 $block_hash->{'new_files'}->{$full_curr_file} = 1;
865 }
866 }
867
868
869 delete $block_hash->{'all_files'}->{$curr_file};
870 }
871
872
873
874
875 # Deal with complication of new or modified metadata files by forcing
876 # everything from this point down in the file hierarchy to
877 # be freshly imported.
878 #
879 # This may mean files that have not changed are reindexed, but does
880 # guarantee by the end of processing all new metadata is correctly
881 # associated with the relevant document(s).
882
883 foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
884 my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");
885
886 $situated_dir =~ s/[\\\/]+$//; # remove trailing slashes
887 $situated_dir =~ s/\\/\\\\/g; # need to protect windows slash \ in regular expression
888
889 # Go through existing_files, and mark anything that is contained
890 # within 'situated_dir' to be reindexed (in case some of the metadata
891 # attaches to one of these files)
892
893 my $reindex_files = [];
894
895 foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {
896
897 if ($existing_f =~ m/^$situated_dir/) {
898 push(@$reindex_files,$existing_f);
899 $block_hash->{'reindex_files'}->{$existing_f} = 1;
900 delete $block_hash->{'existing_files'}->{$existing_f};
901
902 }
903 }
904
905 # metadata file needs to be in new_files list so parsed by MetadataXMLPlug
906 # (or equivalent)
907 $block_hash->{'new_files'}->{$new_mdf} = 1;
908
909 }
910
911 # go through remaining existing files and work out what has changed and needs to be reindexed.
912 my @existing_files = sort keys %{$block_hash->{'existing_files'}};
913
914 my $reindex_files = [];
915
916 foreach my $existing_filename (@existing_files) {
917 if (-M $existing_filename < $archiveinf_timestamp) {
918 # file is newer than last build
919
920 my $existing_file = $existing_filename;
921 #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});
922
923 #my $collectdir_resafe = &util::filename_to_regex($collectdir);
924 #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;
925
926 print STDERR "**** Reindexing existing file: $existing_file\n";
927
928 push(@$reindex_files,$existing_file);
929 $block_hash->{'reindex_files'}->{$existing_filename} = 1;
930 }
931
932 }
933
934
935 # By this point full_prev_all_files contains the files
936 # mentioned in archiveinf-src.db that are no longer in the 'import'
937 # folder (or whatever was specified through -importdir ...)
938 
939 # This list can contain files that were created in the 'tmp' or
940 # 'cache' areas (such as screen-size and thumbnail images).
941 #
942 # In building the final list of files to delete, we test whether each
943 # one still exists on the filesystem; if it does (unusual for a "normal"
944 # file in import, but possible in the case of 'tmp' files),
945 # suppress it from going into the final list
946
947 my $collectdir = $ENV{'GSDLCOLLECTDIR'};
948
949 my @deleted_files = values %$full_prev_all_files;
950 map { my $curr_file = $_;
951 my $full_curr_file = $curr_file;
952
953 if (!&util::filename_is_absolute($curr_file)) {
954 # add in the collect dir to make absolute
955
956 $full_curr_file = &util::filename_cat($collectdir,$curr_file);
957 }
958
959
960 if (!-e $full_curr_file) {
961 $block_hash->{'deleted_files'}->{$curr_file} = 1;
962 }
963 } @deleted_files;
964
965
966
967}
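# On return the keys of $block_hash that matter to the caller are
# 'new_files', 'existing_files', 'reindex_files', 'deleted_files' and
# 'new_or_modified_metadata_files'; process_files above acts on the new,
# deleted and reindex lists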
968
969
970# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
971# $mode is 'delete' or 'reindex'
972sub mark_docs_for_deletion
973{
974 my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;
975
976 my $mode_text = "deleted from index";
977 if ($mode eq "reindex") {
978 $mode_text = "reindexed";
979 }
980
981 # Get the infodbtype value for this collection from the arcinfo object
982 my $infodbtype = $archive_info->{'infodbtype'};
983
984 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
985 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);
986
987
988 # record files marked for deletion in arcinfo
989 foreach my $file (@$deleted_files) {
990 # use 'archiveinf-src' info database file to look up all the OIDs
991 # that this file is used in (note in most cases, it's just one OID)
992
993 my $src_rec_string = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
994 my $src_rec = &dbutil::convert_infodb_string_to_hash($src_rec_string);
995 my $oids = $src_rec->{'oid'};
996 my $file_record_deleted = 0;
997
998 # delete the src record
999 my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
1000 &dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
1001 &dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);
1002
1003
1004 foreach my $oid (@$oids) {
1005
1006 # find the source doc (the primary file that becomes this oid)
1007 my $doc_rec_string = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
1008 my $doc_rec = &dbutil::convert_infodb_string_to_hash($doc_rec_string);
1009 my $doc_source_file = $doc_rec->{'src-file'}->[0];
1010 if (!&util::filename_is_absolute($doc_source_file)) {
1011 $doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
1012 }
1013
1014 if ($doc_source_file ne $file) {
1015 # it's an associated or metadata file
1016 
1017 # mark the source doc for reimport as one of its assoc files has changed or been deleted
1018 $block_hash->{'reindex_files'}->{$doc_source_file} = 1;
1019
1020 }
1021 my $curr_status = $archive_info->get_status_info($oid);
1022 if (defined($curr_status) && (($curr_status ne "D"))) {
1023 if ($verbosity>1) {
1024 print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
1025 }
1026 # mark oid for deletion (it will be deleted or reimported)
1027 $archive_info->set_status_info($oid,"D");
1028 my $val = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
1029 $val =~ s/^<index-status>(.*)$/<index-status>D/m;
1030
1031 my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
1032 my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");
1033
1034 &dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
1035 &dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
1036 }
1037 }
1038
1039 }
1040 # now go through and check that we haven't marked any primary files for reindexing (because their associated files have changed or been deleted) when those primary files have themselves been deleted.
1041 foreach my $file (@$deleted_files) {
1042 if (defined $block_hash->{'reindex_files'}->{$file}) {
1043 delete $block_hash->{'reindex_files'}->{$file};
1044 }
1045 }
1046
1047
1048}
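# (the index-status of "D" recorded above is what the next buildcol.pl run,
# as the message printed for each OID indicates, uses to drop or rebuild the
# affected documents)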
1049
1050
1051
10521;