source: main/trunk/greenstone2/perllib/inexport.pm@ 22413

Last change on this file since 22413 was 22413, checked in by davidb, 14 years ago

Initial pass at getting the main code of import.pl (and the very similar export.pl) structured as a shared module

  • Property svn:executable set to *
File size: 30.6 KB
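The commit note above describes factoring the shared logic of import.pl and export.pl into this class. As a rough sketch only (not taken from this file; the actual driver scripts may differ in detail, and $options here stands for the parsed option spec they build), a caller would use the methods defined below along these lines:

    # Sketch: how import.pl might drive inexport.pm, based on the method
    # signatures in this file; the option-spec setup ($options) is assumed.
    my $inexport   = new inexport(\@ARGV, $options);
    my $collection = $inexport->get_collection();
    my $collectcfg = $inexport->read_collection_cfg($collection, $options);
    $inexport->set_collection_options("import", $collectcfg);
    my $pluginfo   = $inexport->process_files("import", $collectcfg);
    $inexport->generate_statistics("import", $pluginfo);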
Rev  Line
[18457]1###########################################################################
2#
[22413]3# inexport.pm -- useful class to support import.pl and export.pl
[18457]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
[22413]30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
[19789]32
[22413]33use arcinfo;
34use colcfg;
[21553]35use dbutil;
[22413]36use plugin;
37use plugout;
38use manifest;
39use inexport;
40use dbutil;
[18457]41use util;
[22413]42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
[18457]47
[22413]48use File::Basename;
[21563]49
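# Constructor: parses the command-line arguments against the supplied option
# spec, handles -xml, -language and -gli, opens the requested output handle,
# and records the collection name (the single remaining argument).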
[22413]50sub new
51{
52 my $class = shift (@_);
53 my ($argv,$options) = @_;
54
55 my $self = { 'xml' => 0 };
56
57 # general options available to all plugins
58 my $arguments = $options->{'args'};
59 my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
60 # Parse returns -1 if something has gone wrong
61 if ($intArgLeftinAfterParsing == -1)
62 {
63 &PrintUsage::print_txt_usage($options, "{import.params}");
64 die "\n";
65 }
66
67 my $language = $self->{'language'};
68 # If $language has been specified, load the appropriate resource bundle
69 # (Otherwise, the default resource bundle will be loaded automatically)
70 if ($language && $language =~ /\S/) {
71 &gsprintf::load_language_specific_resource_bundle($language);
72 }
73
74 if ($self->{'xml'}) {
75 &PrintUsage::print_xml_usage($options);
76 print "\n";
77 return;
78 }
79
80 if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
81 &gsprintf::output_strings_in_UTF8;
82 }
83
84 # Now check that we have exactly one leftover arg, which should be
85 # the collection name. We don't want to do this earlier, because the
86 # -xml arg doesn't need a collection name.
87 # Also output the usage if the user specified -h.
88 if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
89 {
90 &PrintUsage::print_txt_usage($options, "{import.params}");
91 die "\n";
92 }
93
94 $self->{'close_out'} = 0;
95 my $out = $self->{'out'};
96 if ($out !~ /^(STDERR|STDOUT)$/i) {
97 open (OUT, ">$out") ||
98 (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
99 $out = 'inexport::OUT';
100 $self->{'close_out'} = 1;
101 }
102 $out->autoflush(1);
103 $self->{'out'} = $out;
104
105 # $argv should now hold only one item: the name of the collection
106 $self->{'collection'} = shift @$argv;
107
108 return bless $self, $class;
109}
110
111sub get_collection
112{
113 my $self = shift @_;
114
115 return $self->{'collection'};
116}
117
118
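# Locates the collection via colcfg::use_collection, adds the collection's
# perllib directory to @INC, opens the fail log, then reads collect.cfg and
# returns the parsed configuration.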
119sub read_collection_cfg
120{
121 my $self = shift @_;
122 my ($collection,$options) = @_;
123
124 my $collectdir = $self->{'collectdir'};
125 my $site = $self->{'site'};
126 my $out = $self->{'out'};
127
128 if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
129 &PrintUsage::print_txt_usage($options, "{import.params}");
130 die "\n";
131 }
132
133 # add the collection's perllib dir into the include path in
134 # case we have collection-specific modules
135 unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
136
137 # check that we can open the faillog
138 my $faillog = $self->{'faillog'};
139 if ($faillog eq "") {
140 $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
141 }
142 open (FAILLOG, ">$faillog") ||
143 (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);
144
145
146 my $faillogname = $faillog;
147 $faillog = 'inexport::FAILLOG';
148 $faillog->autoflush(1);
149 $self->{'faillog'} = $faillog;
150 $self->{'faillogname'} = $faillogname;
151
152 # Read in the collection configuration file.
153 my ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
154 my $collectcfg = &colcfg::read_collection_cfg ($configfilename, $gs_mode);
155
156 return $collectcfg;
157}
158
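# Merges settings from the collection configuration with the command-line
# options, filling in defaults for the import/archives directories, verbosity,
# OIDtype, sortmeta, and the removeold/keepold/incremental flags.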
159sub set_collection_options
160{
161 my $self = shift @_;
162 my ($inexport_mode,$collectcfg) = @_;
163
164 my $verbosity = $self->{'verbosity'};
165 print STDERR "**** verbosity = $verbosity\n\n\n";
166
167 my $debug = $self->{'debug'};
168 my $importdir = $self->{'importdir'};
169 my $archivedir = $self->{'archivedir'};
170 my $out = $self->{'out'};
171
172 # If the infodbtype value wasn't defined in the collect.cfg file, use the default
173 if (!defined($collectcfg->{'infodbtype'}))
174 {
175 $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
176 }
177
178 if (defined $collectcfg->{'importdir'} && $importdir eq "") {
179 $importdir = $collectcfg->{'importdir'};
180 }
181 if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
182 $archivedir = $collectcfg->{'archivedir'};
183 }
184 # fill in the default import and archives directories if none
185 # were supplied, turn all \ into / and remove trailing /
186 $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
187 $importdir =~ s/[\\\/]+/\//g;
188 $importdir =~ s/\/$//;
189 if (!-e $importdir) {
190 &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
191 die "\n";
192 }
193 $self->{'importdir'} = $importdir;
194
195 if ($archivedir eq "") {
196 if ($inexport_mode eq "import") {
197 $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
198 }
199 elsif ($inexport_mode eq "export") {
200 $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
201 }
202 else {
203 print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
204 print STDERR " Defaulting to 'archives' for file output\n";
205 $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
206 }
207 }
208
209 $archivedir =~ s/[\\\/]+/\//g;
210 $archivedir =~ s/\/$//;
211 $self->{'archivedir'} = $archivedir;
212
213 if ($verbosity !~ /\d+/) {
214 if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
215 $verbosity = $collectcfg->{'verbosity'};
216 } else {
217 $verbosity = 2; # the default
218 }
219 }
220 if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
221 $self->{'manifest'} = $collectcfg->{'manifest'};
222 }
223
224 if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
225 if ($collectcfg->{'gzip'} =~ /^true$/i) {
226 $self->{'gzip'} = 1;
227 }
228 }
229
230 if ($self->{'maxdocs'} !~ /\-?\d+/) {
231 if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
232 $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
233 } else {
234 $self->{'maxdocs'} = -1; # the default
235 }
236 }
237 if ($self->{'groupsize'} == 1) {
238 if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
239 $self->{'groupsize'} = $collectcfg->{'groupsize'};
240 }
241 }
242
243 if (!defined $self->{'OIDtype'}
244 || ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) {
245 if (defined $collectcfg->{'OIDtype'}
246 && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
247 $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
248 } else {
249 $self->{'OIDtype'} = "hash"; # the default
250 }
251 }
252
253 if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
254 if (defined $collectcfg->{'OIDmetadata'}) {
255 $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
256 } else {
257 $self->{'OIDmetadata'} = "dc.Identifier"; # the default
258 }
259 }
260
261 my $sortmeta = $self->{'sortmeta'};
262 if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
263 $sortmeta = $collectcfg->{'sortmeta'};
264 }
265 # sortmeta cannot be used with group size
266 $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
267 if (defined $sortmeta && $self->{'groupsize'} > 1) {
268 &gsprintf($out, "{import.cannot_sort}\n\n");
269 $sortmeta = undef;
270 }
271 $self->{'sortmeta'} = $sortmeta;
272
273 if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") {
274 $self->{'removeprefix'} = $collectcfg->{'removeprefix'};
275 }
276
277 if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") {
278 $self->{'removesuffix'} = $collectcfg->{'removesuffix'};
279 }
280 if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
281 $self->{'debug'} = 1;
282 }
283 if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
284 $self->{'gli'} = 1;
285 }
286 $self->{'gli'} = 0 unless defined $self->{'gli'};
287
288 # check keepold and removeold
289 my ($removeold, $keepold, $incremental, $incremental_mode)
290 = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
291 $self->{'incremental'}, "archives",
292 $collectcfg);
293
294 $self->{'removeold'} = $removeold;
295 $self->{'keepold'} = $keepold;
296 $self->{'incremental'} = $incremental;
297 $self->{'incremental_mode'} = $incremental_mode;
298}
299
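# The main import/export pass: loads plugins and the plugout, reads the
# archive information databases, determines which files are new, modified or
# deleted (either incrementally or via a manifest), and passes them to the
# plugins for processing.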
300sub process_files
301{
302 my $self = shift @_;
303 my ($inexport_mode,$collectcfg) = @_;
304
305 my $verbosity = $self->{'verbosity'};
306 my $debug = $self->{'debug'};
307
308 my $importdir = $self->{'importdir'};
309 my $archivedir = $self->{'archivedir'};
310
311 my $incremental = $self->{'incremental'};
312 my $incremental_mode = $self->{'incremental_mode'};
313
314 my $removeold = $self->{'removeold'};
315 my $keepold = $self->{'keepold'};
316
317 my $saveas = $self->{'saveas'};
318 my $OIDtype = $self->{'OIDtype'};
319 my $OIDmetadata = $self->{'OIDmetadata'};
320
321 my $out = $self->{'out'};
322 my $faillog = $self->{'faillog'};
323
324 my $maxdocs = $self->{'maxdocs'};
325 my $gzip = $self->{'gzip'};
326 my $groupsize = $self->{'groupsize'};
327 my $sortmeta = $self->{'sortmeta'};
328
329 my $removeprefix = $self->{'removeprefix'};
330 my $removesuffix = $self->{'removesuffix'};
331
332 my $gli = $self->{'gli'};
333
334 print STDERR "<Import>\n" if $gli;
335
336 my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
337 if ($self->{'manifest'} ne "") {
338 my $manifest_filename = $self->{'manifest'};
339
340 if (!&util::filename_is_absolute($manifest_filename)) {
341 $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
342 }
343
344 $self->{'manifest'} =~ s/[\\\/]+/\//g;
345 $self->{'manifest'} =~ s/\/$//;
346
347 $manifest_lookup->parse($manifest_filename);
348 }
349
350 my $manifest = $self->{'manifest'};
351
352 # load all the plugins
353 my $plugins = [];
354 if (defined $collectcfg->{'plugin'}) {
355 $plugins = $collectcfg->{'plugin'};
356 }
357
358 # some global options for the plugins
359 my @global_opts = ();
360
361
362 my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental_mode);
363 if (scalar(@$pluginfo) == 0) {
364 &gsprintf($out, "{import.no_plugins_loaded}\n");
365 die "\n";
366 }
367
368 # remove the old contents of the archives directory (and tmp directory) if needed
369 if ($removeold) {
370 if (-e $archivedir) {
371 &gsprintf($out, "{import.removing_archives}\n");
372 &util::rm_r ($archivedir);
373 }
374 my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
375 $tmpdir =~ s/[\\\/]+/\//g;
376 $tmpdir =~ s/\/$//;
377 if (-e $tmpdir) {
378 &gsprintf($out, "{import.removing_tmpdir}\n");
379 &util::rm_r ($tmpdir);
380 }
381 }
382
383 # create the archives dir if needed
384 &util::mk_all_dir($archivedir);
385
386 # read the archive information file
387## my $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archives.inf");
388
389 # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
390 &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
391 &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));
392
393 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
394 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
395
396 my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
397 $archive_info->load_info ($arcinfo_doc_filename);
398
399 if ($manifest eq "") {
400 # Load in list of files in import folder from last import (if present)
401 $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
402 }
403
404 #### Use Plugout ####
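    # $plugout is built up as an array ref holding the plugout name followed
    # by its option flags; plugout::load_plugout() instantiates it below.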
405 my ($plugout);
406 if (defined $collectcfg->{'plugout'}) {
407 # If a plugout was specified in the collect.cfg file, assume it is sensible
408 # We can't check the name because it could be anything, if it is a custom plugout
409 $plugout = $collectcfg->{'plugout'};
410 }
411 else{
412 if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
413 push @$plugout,"GreenstoneXMLPlugout";
414 }
415 else{
416 push @$plugout,$saveas."Plugout";
417 }
418 }
419
420 push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
421 push @$plugout,("-verbosity",$verbosity) if (defined $verbosity);
422 push @$plugout,("-gzip_output") if ($gzip);
423 push @$plugout,("-group_size",$groupsize) if (defined $groupsize);
424 push @$plugout,("-output_handle",$out) if (defined);
425 push @$plugout,("-debug") if ($debug);
426
427 my $processor = &plugout::load_plugout($plugout);
428 $processor->setoutputdir ($archivedir);
429 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
430 $processor->set_OIDtype ($OIDtype, $OIDmetadata);
431
432 &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
433
434 if ($removeold) {
435 # occasionally, plugins may want to do something on removeold, e.g. Pharos image indexing
436 &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
437 }
438 if ($manifest eq "") {
439 # process the import directory
440 my $block_hash = {};
441 my $metadata = {};
442 # global blocking pass may set up some metadata
443 &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
444
445
446 if ($incremental || $incremental_mode eq "onlyadd") {
447
448 prime_doc_oid_count($archivedir);
449
450
451 # Can now work out which files were new, already existed, and have
452 # been deleted
453
454 new_vs_old_import_diff($archive_info,$block_hash,$importdir,
455 $archivedir,$verbosity,$incremental_mode);
456
457 my @new_files = sort keys %{$block_hash->{'new_files'}};
458 if (scalar(@new_files) > 0) {
459 print STDERR "New files and modified metadata files since last import:\n ";
460 print STDERR join("\n ",@new_files), "\n";
461 }
462
463 if ($incremental) {
464 # only look for deletions if we are truly incremental
465 my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
466 # Filter out any in gsdl/tmp area
467 my @filtered_deleted_files = ();
468 my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
469 my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
470 $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
471 $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
472
473 foreach my $df (@deleted_files) {
474 next if ($df =~ m/^$gsdl_tmp_area/);
475 next if ($df =~ m/^$collect_tmp_area/);
476
477 push(@filtered_deleted_files,$df);
478 }
479
480
481 @deleted_files = @filtered_deleted_files;
482
483 if (scalar(@deleted_files)>0) {
484 print STDERR "Files deleted since last import:\n ";
485 print STDERR join("\n ",@deleted_files), "\n";
486
487
488 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
489
490 mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
491 }
492
493 my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
494
495 if (scalar(@reindex_files)>0) {
496 print STDERR "Files to reindex since last import:\n ";
497 print STDERR join("\n ",@reindex_files), "\n";
498 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
499 mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
500 }
501
502 }
503
504 # Play it safe, and run through the entire folder, only processing new or edited files
505 &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
506
507 }
508 else {
509 &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
510 }
511
512 }
513 else
514 {
515 #
516 # 1. Process files marked for deletion first
517 #
518
519 my @deleted_files = keys %{$manifest_lookup->{'delete'}};
520 my @full_deleted_files = ();
521
522 # ensure all filenames are absolute
523 foreach my $df (@deleted_files) {
524 my $full_df =
525 (&util::filename_is_absolute($df))
526 ? $df
527 : &util::filename_cat($importdir,$df);
528
529 push(@full_deleted_files,$full_df);
530 }
531
532 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
533 mark_docs_for_deletion($archive_info,{},
534 \@full_deleted_files,
535 $archivedir, $verbosity, "delete");
536
537
538 #
539 # 2. Now files for reindexing
540 #
541
542 my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
543 my @full_reindex_files = ();
544
545 # ensure all filenames are absolute
546 foreach my $rf (@reindex_files) {
547 my $full_rf =
548 (&util::filename_is_absolute($rf))
549 ? $rf
550 : &util::filename_cat($importdir,$rf);
551
552 push(@full_reindex_files,$full_rf);
553 }
554
555 &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
556 mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
557
558 # And now ensure the new version of the file is processed by the
559 # appropriate plugin
560 foreach my $full_rf (@full_reindex_files) {
561 &plugin::read ($pluginfo, "", $full_rf, {}, {}, $processor, $maxdocs, 0, $gli);
562 }
563
564
565 #
566 # 3. Now finally any new files
567 #
568
569 foreach my $file (keys %{$manifest_lookup->{'index'}}) {
570 &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli);
571 }
572
573
574 }
575
576 &plugin::end($pluginfo, $processor);
577
578 &plugin::deinit($pluginfo, $processor);
579
580 # Store the value of OIDCount (used in doc.pm) so it can be
581 # restored correctly to this value on an incremental build
582 store_doc_oid_count($archivedir);
583
584 # write out the archive information file
585 $processor->close_file_output() if $groupsize > 1;
586 $processor->close_group_output() if $processor->is_group();
587
588# The following 'if' statement is in the export.pl version of the script.
589# The reason for the 'if' statement is given in export.pl; it is unclear
590# at this point whether the same should be done here.
591## if (($saveas =~ m/^.*METS$/) || ($saveas eq "MARC")) {
592 # Not all export types need this (e.g. DSpace)
593
594 # should we still do this in debug mode??
595
596 # for backwards compatibility with the archives.inf file
597 if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
598 $archive_info->save_info($arcinfo_doc_filename);
599 }
600 else {
601 $archive_info->save_revinfo_db($arcinfo_src_filename);
602 }
603
604
605## }
606
607 return $pluginfo;
608}
609
610
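# Writes the end-of-import statistics (to the nominated stats file, or to
# STDERR/STDOUT), then closes the output and fail-log handles.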
611sub generate_statistics
612{
613 my $self = shift @_;
614 my ($inexport_mode,$pluginfo) = @_;
615
616 my $statsfile = $self->{'statsfile'};
617 my $out = $self->{'out'};
618 my $faillogname = $self->{'faillogname'};
619 my $gli = $self->{'gli'};
620
621 # write out import stats
622 my $close_stats = 0;
623 if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
624 if (open (STATS, ">$statsfile")) {
625 $statsfile = 'inexport::STATS';
626 $close_stats = 1;
627 } else {
628 &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
629 &gsprintf($out, "{import.stats_backup}\n");
630 $statsfile = 'STDERR';
631 }
632 }
633
634 &gsprintf($out, "\n");
635 &gsprintf($out, "*********************************************\n");
636 &gsprintf($out, "{import.complete}\n");
637 &gsprintf($out, "*********************************************\n");
638
639 &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
640 if ($close_stats) {
641 close STATS;
642 }
643
644 close OUT if $self->{'close_out'};
645 close FAILLOG;
646}
647
648
649
650
651
652
653
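# Helpers for persisting doc.pm's OID counter between builds: the counter is
# written to an "OIDcount" file in the archives directory so that a later
# incremental import can continue numbering from where this one finished.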
[21306]654sub oid_count_file {
655 my ($archivedir) = @_;
656 return &util::filename_cat ($archivedir, "OIDcount");
657}
658
659
[18528]660sub prime_doc_oid_count
661{
662 my ($archivedir) = @_;
[21306]663 my $oid_count_filename = &oid_count_file($archivedir);
[18528]664
665 if (-e $oid_count_filename) {
666 if (open(OIDIN,"<$oid_count_filename")) {
667 my $OIDcount = <OIDIN>;
668 chomp $OIDcount;
669 close(OIDIN);
670
671 $doc::OIDcount = $OIDcount;
672 }
673 else {
674
675 print STDERR "Warning: unable to read document OID count from $oid_count_filename\n";
676 print STDERR "Setting value to 0\n";
677 }
678 }
679
680}
681
682sub store_doc_oid_count
683{
684 # Use the file "OIDcount" in the archives directory to record
685 # what value doc.pm got up to
686
687 my ($archivedir) = @_;
[21306]688 my $oid_count_filename = &oid_count_file($archivedir);
[18528]689
690
691 if (open(OIDOUT,">$oid_count_filename")) {
692 print OIDOUT $doc::OIDcount, "\n";
693
694 close(OIDOUT);
695 }
696 else {
697 print STDERR "Warning: unable to store document OID count\n";
698 }
699}
700
701
702
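# Compares the files found in the import directory on this run against the
# file list recorded by the previous import (archiveinf-src), classifying
# each entry in $block_hash as new, existing, deleted or needing reindexing.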
[18457]703sub new_vs_old_import_diff
704{
[20578]705 my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;
[18457]706
[21620]707 # Get the infodbtype value for this collection from the arcinfo object
708 my $infodbtype = $archive_info->{'infodbtype'};
709
[20776]710 # In this method we want to know whether metadata files have been modified.
[21620]711 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
[20776]712
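    # Note: -M gives a file's age in days relative to script start-up, so a
    # file whose -M value is smaller than this one has been modified more
    # recently than the archive info database, i.e. since the last import.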
713 my $archiveinf_timestamp = -M $arcinfo_doc_filename;
714
[18457]715 # First convert all files to absolute form
716 # This is to support the situation where the import folder is not
717 # the default
718
719 my $prev_all_files = $archive_info->{'prev_import_filelist'};
720 my $full_prev_all_files = {};
721
722 foreach my $prev_file (keys %$prev_all_files) {
723
724 if (!&util::filename_is_absolute($prev_file)) {
725 my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
726 $full_prev_all_files->{$full_prev_file} = $prev_file;
727 }
728 else {
729 $full_prev_all_files->{$prev_file} = $prev_file;
730 }
731 }
732
[18469]733
[18457]734 # Figure out which files are new and which already existed, and so,
735 # by implication, which files from the previous import are no longer
736 # there => mark those for deletion
737 foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
738
739 my $full_curr_file = $curr_file;
740
741 # entry in 'all_files' is moved to either 'existing_files',
[20776]742 # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'
[18457]743
744 if (!&util::filename_is_absolute($curr_file)) {
745 # add in import dir to make absolute
746 $full_curr_file = &util::filename_cat($importdir,$curr_file);
747 }
748
[19498]749 # figure out whether this is a new file or not
[18457]750 if (defined $full_prev_all_files->{$full_curr_file}) {
[20776]751 # delete it so that only files that need deleting are left
752 delete $full_prev_all_files->{$full_curr_file};
753
754 # had it before. is it a metadata file?
755 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
[20578]756
[20776]757 # is it modified??
758 if (-M $full_curr_file < $archiveinf_timestamp) {
759 print STDERR "*** Detected a modified metadata file: $full_curr_file\n" if $verbosity > 2;
760 # its newer than last build
761 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
762 }
[20578]763 }
764 else {
[20776]765 if ($incremental_mode eq "all") {
766
767 # had it before
768 $block_hash->{'existing_files'}->{$full_curr_file} = 1;
769
770 }
771 else {
772 # Warning in "onlyadd" mode, but had it before!
773 print STDERR "Warning: File $full_curr_file previously imported.\n";
774 print STDERR " Treating as new file\n";
775
776 $block_hash->{'new_files'}->{$full_curr_file} = 1;
777
778 }
[20578]779 }
780 }
781 else {
782 if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
783 # the new file is the special sort of file Greenstone uses
784 # to attach metadata to src documents,
785 # i.e. metadata.xml
786 # (but note, the filename used is not constrained in
787 # Greenstone to always be this)
[18457]788
[20776]789 print STDERR "***** Detected new metadata file: $full_curr_file\n" if $verbosity > 2;
790 $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
[20578]791 }
792 else {
793 $block_hash->{'new_files'}->{$full_curr_file} = 1;
794 }
[18457]795 }
[20578]796
[18457]797
798 delete $block_hash->{'all_files'}->{$curr_file};
799 }
800
[20578]801
[21306]802
803
[20776]804 # Deal with the complication of new or modified metadata files by forcing
[20578]805 # everything from this point down in the file hierarchy to
806 # be freshly imported.
807 #
808 # This may mean files that have not changed are reindexed, but it does
809 # guarantee that by the end of processing all new metadata is correctly
810 # associated with the relevant document(s).
811
[20776]812 foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
[20578]813 my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");
814
815 $situated_dir =~ s/[\\\/]+$//; # remove trailing slashes
[20769]816 $situated_dir =~ s/\\/\\\\/g; # need to escape the Windows backslash \ for use in a regular expression
817
[20578]818 # Go through existing_files, and mark anything that is contained
819 # within 'situated_dir' to be reindexed (in case some of the metadata
820 # attaches to one of these files)
821
822 my $reindex_files = [];
823
824 foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {
[20769]825
[20578]826 if ($existing_f =~ m/^$situated_dir/) {
827 push(@$reindex_files,$existing_f);
828 $block_hash->{'reindex_files'}->{$existing_f} = 1;
[21306]829 delete $block_hash->{'existing_files'}->{$existing_f};
[20578]830
831 }
832 }
833
834 # metadata file needs to be in the new_files list so it is parsed by
835 # MetadataXMLPlug (or equivalent)
836 $block_hash->{'new_files'}->{$new_mdf} = 1;
837
838 }
839
[21306]840 # go through remaining existing files and work out what has changed and needs to be reindexed.
841 my @existing_files = sort keys %{$block_hash->{'existing_files'}};
842
843 my $reindex_files = [];
844
845 foreach my $existing_filename (@existing_files) {
846 if (-M $existing_filename < $archiveinf_timestamp) {
847 # file is newer than last build
848
849 my $existing_file = $existing_filename;
850 #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});
851
852 #my $collectdir_resafe = &util::filename_to_regex($collectdir);
853 #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;
854
855 print STDERR "**** Reindexing existing file: $existing_file\n";
856
857 push(@$reindex_files,$existing_file);
858 $block_hash->{'reindex_files'}->{$existing_filename} = 1;
859 }
860
861 }
862
[20578]863
[18469]864 # By this point full_prev_all_files contains the files
865 # mentioned in archiveinf-src.db that are no longer in the 'import'
866 # folder (or whatever was specified through -importdir ...)
867
868 # This list can contain files that were created in the 'tmp' or
869 # 'cache' areas (such as screen-size and thumbnail images).
[18457]870 #
[18469]871 # In building the final list of files to delete, we test whether each
[20578]872 # file still exists on the filesystem and, if it does (unusual for a
873 # "normal" file in import, but possible in the case of 'tmp' files),
874 # suppress it from going into the final list
[18469]875
876 my $collectdir = $ENV{'GSDLCOLLECTDIR'};
877
[18457]878 my @deleted_files = values %$full_prev_all_files;
[18469]879 map { my $curr_file = $_;
880 my $full_curr_file = $curr_file;
881
882 if (!&util::filename_is_absolute($curr_file)) {
883 # add in import dir to make absolute
884
885 $full_curr_file = &util::filename_cat($collectdir,$curr_file);
886 }
887
888
889 if (!-e $full_curr_file) {
890 $block_hash->{'deleted_files'}->{$curr_file} = 1;
891 }
892 } @deleted_files;
[20578]893
894
895
[18457]896}
897
[19498]898
[20788]899# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
[21306]900# $mode is 'delete' or 'reindex'
901sub mark_docs_for_deletion
[18457]902{
[21306]903 my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;
[18457]904
[21306]905 my $mode_text = "deleted from index";
906 if ($mode eq "reindex") {
907 $mode_text = "reindexed";
908 }
[18457]909
[21620]910 # Get the infodbtype value for this collection from the arcinfo object
911 my $infodbtype = $archive_info->{'infodbtype'};
[18457]912
[21620]913 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
914 my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);
915
[22010]916
[18457]917 # record files marked for deletion in arcinfo
[19498]918 foreach my $file (@$deleted_files) {
[21564]919 # use 'archiveinf-src' info database file to look up all the OIDs
[19789]920 # that this file is used in (note in most cases, it's just one OID)
[18457]921
[21620]922 my $src_rec_string = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
[21554]923 my $src_rec = &dbutil::convert_infodb_string_to_hash($src_rec_string);
[18457]924 my $oids = $src_rec->{'oid'};
[20776]925 my $file_record_deleted = 0;
[20788]926
927 # delete the src record
[22010]928 my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
[21620]929 &dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
[22010]930 &dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);
931
932
[18457]933 foreach my $oid (@$oids) {
934
[20788]935 # find the source doc (the primary file that becomes this oid)
[21620]936 my $doc_rec_string = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
[21554]937 my $doc_rec = &dbutil::convert_infodb_string_to_hash($doc_rec_string);
[20776]938 my $doc_source_file = $doc_rec->{'src-file'}->[0];
939 if (!&util::filename_is_absolute($doc_source_file)) {
940 $doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
941 }
[18457]942
[20788]943 if ($doc_source_file ne $file) {
944 # it's an associated or metadata file
945
[20776]946 # mark source doc for reimport as one of its assoc files has changed or been deleted
947 $block_hash->{'reindex_files'}->{$doc_source_file} = 1;
[20788]948
[18457]949 }
[20788]950 my $curr_status = $archive_info->get_status_info($oid);
951 if (defined($curr_status) && (($curr_status ne "D"))) {
952 if ($verbosity>1) {
953 print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
[19498]954 }
[20788]955 # mark oid for deletion (it will be deleted or reimported)
956 $archive_info->set_status_info($oid,"D");
[21620]957 my $val = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
[20788]958 $val =~ s/^<index-status>(.*)$/<index-status>D/m;
[21557]959
960 my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
[22010]961 my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");
962
[21620]963 &dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
[22010]964 &dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
[19498]965 }
[18457]966 }
[22327]967
[18457]968 }
[22327]969 # Now go through and check that we haven't marked any primary files for reindexing (because their associated files have changed or been deleted) when the primary files have themselves been deleted.
970 foreach my $file (@$deleted_files) {
971 if (defined $block_hash->{'reindex_files'}->{$file}) {
972 delete $block_hash->{'reindex_files'}->{$file};
973 }
974 }
[21560]975
[22010]976
[18457]977}
978
979
[18554]980
[18457]9811;