root/main/trunk/greenstone2/perllib/inexport.pm @ 22413

Revision 22413, 30.6 KB (checked in by davidb, 10 years ago)

Initial pass at getting the main code to import.pl (and the very similar export.pl) structured as a shared module

  • Property svn:executable set to *
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use plugin;
37use plugout;
38use manifest;
39use inexport;
40use dbutil;
41use util;
42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
47
48use File::Basename;
49
50sub new
51{
52    my $class = shift (@_);
53    my ($argv,$options) = @_;
54
55    my $self = { 'xml' => 0 };
56
57    # general options available to all plugins
58    my $arguments = $options->{'args'};
59    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
60    # Parse returns -1 if something has gone wrong
61    if ($intArgLeftinAfterParsing == -1)
62    {
63    &PrintUsage::print_txt_usage($options, "{import.params}");
64    die "\n";
65    }
66   
67    my $language = $self->{'language'};
68    # If $language has been specified, load the appropriate resource bundle
69    # (Otherwise, the default resource bundle will be loaded automatically)
70    if ($language && $language =~ /\S/) {
71    &gsprintf::load_language_specific_resource_bundle($language);
72    }
73
74    if ($self->{'xml'}) {
75        &PrintUsage::print_xml_usage($options);
76    print "\n";
77    return;
78    }
79
80    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
81    &gsprintf::output_strings_in_UTF8;
82    }
83   
84    # now check that we had exactly one leftover arg, which should be
85    # the collection name. We don't want to do this earlier, cos
86    # -xml arg doesn't need a collection name
87    # Or if the user specified -h, then we output the usage also
88    if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
89    {
90    &PrintUsage::print_txt_usage($options, "{import.params}");
91    die "\n";
92    }
93
94    $self->{'close_out'} = 0;
95    my $out = $self->{'out'};
96    if ($out !~ /^(STDERR|STDOUT)$/i) {
97    open (OUT, ">$out") ||
98        (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
99    $out = 'import::OUT';
100    $self->{'close_out'} = 1;
101    }
102    $out->autoflush(1);
103    $self->{'out'} = $out;
104
105    # @ARGV should be only one item, the name of the collection
106    $self->{'collection'} = shift @$argv;
107
108    return bless $self, $class;
109}
110
111sub get_collection
112{
113    my $self = shift @_;
114   
115    return $self->{'collection'};
116}
117
118
119sub read_collection_cfg
120{
121    my $self = shift @_;
122    my ($collection,$options) = @_;
123
124    my $collectdir = $self->{'collectdir'};
125    my $site       = $self->{'site'};
126    my $out        = $self->{'out'};
127
128    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
129    &PrintUsage::print_txt_usage($options, "{import.params}");
130    die "\n";
131    }
132
133    # add collection's perllib dir  into include path in
134    # case we have collection specific modules
135    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
136
137    # check that we can open the faillog
138    my $faillog = $self->{'faillog'};
139    if ($faillog eq "") {
140    $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
141    }
142    open (FAILLOG, ">$faillog") ||
143    (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);
144
145   
146    my $faillogname = $faillog;
147    $faillog = 'inexport::FAILLOG';
148    $faillog->autoflush(1);
149    $self->{'faillog'} = $faillog;
150    $self->{'faillogname'} = $faillogname;
151
152    # Read in the collection configuration file.
153    my ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
154    my $collectcfg = &colcfg::read_collection_cfg ($configfilename, $gs_mode);
155
156    return $collectcfg;
157}
158
159sub set_collection_options
160{
161    my $self = shift @_;
162    my ($inexport_mode,$collectcfg) = @_;
163
164    my $verbosity  = $self->{'verbosity'};
165    print STDERR "**** verbosity = $verbosity\n\n\n";
166
167    my $debug      = $self->{'debug'};
168    my $importdir  = $self->{'importdir'};
169    my $archivedir = $self->{'archivedir'};
170    my $out        = $self->{'out'};
171
172    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
173    if (!defined($collectcfg->{'infodbtype'}))
174    {
175      $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
176    }
177
178    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
179    $importdir = $collectcfg->{'importdir'};
180    }
181    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
182    $archivedir = $collectcfg->{'archivedir'};
183    }
184    # fill in the default import and archives directories if none
185    # were supplied, turn all \ into / and remove trailing /
186    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
187    $importdir =~ s/[\\\/]+/\//g;
188    $importdir =~ s/\/$//;
189    if (!-e $importdir) {
190    &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
191    die "\n";
192    }
193    $self->{'importdir'} = $importdir;
194
195    if ($archivedir eq "") {
196    if ($inexport_mode eq "import") {
197        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
198    }
199    elsif ($inexport_mode eq "export") {
200        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
201    }
202    else {
203        print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
204        print STDERR "         Defaulting to 'archives' for file output\n";
205        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
206    }
207    }
208
209    $archivedir =~ s/[\\\/]+/\//g;
210    $archivedir =~ s/\/$//;
211    $self->{'archivedir'} = $archivedir;
212
213    if ($verbosity !~ /\d+/) {
214    if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
215        $verbosity = $collectcfg->{'verbosity'};
216    } else {
217        $verbosity = 2; # the default
218    }
219    }
220    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
221    $self->{'manifest'} = $collectcfg->{'manifest'};
222    }
223
224    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
225    if ($collectcfg->{'gzip'} =~ /^true$/i) {
226        $self->{'gzip'} = 1;
227    }
228    }
229
230    if ($self->{'maxdocs'} !~ /\-?\d+/) {
231    if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
232        $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
233    } else {
234        $self->{'maxdocs'} = -1; # the default
235    }
236    }
237    if ($self->{'groupsize'} == 1) {
238    if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
239        $self->{'groupsize'} = $collectcfg->{'groupsize'};
240    }
241    }
242
243    if (!defined $self->{'OIDtype'}
244    || ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) {
245    if (defined $collectcfg->{'OIDtype'}
246        && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
247        $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
248    } else {
249        $self->{'OIDtype'} = "hash"; # the default
250    }
251    }
252
253    if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
254    if (defined $collectcfg->{'OIDmetadata'}) {
255        $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
256    } else {
257        $self->{'OIDmetadata'} = "dc.Identifier"; # the default
258    }
259    }
260
261    my $sortmeta = $self->{'sortmeta'};
262    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
263    $sortmeta = $collectcfg->{'sortmeta'};
264    }
265    # sortmeta cannot be used with group size
266    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
267    if (defined $sortmeta && $self->{'groupsize'} > 1) {
268    &gsprintf($out, "{import.cannot_sort}\n\n");
269    $sortmeta = undef;
270    }
271    $self->{'sortmeta'} = $sortmeta;
272
273    if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") {
274    $self->{'removeprefix'} = $collectcfg->{'removeprefix'};
275    }
276   
277    if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") {
278    $self->{'removesuffix'} = $collectcfg->{'removesuffix'};
279    }
280    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
281    $self->{'debug'} = 1;
282    }
283    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
284    $self->{'gli'} = 1;
285    }
286    $self->{'gli'} = 0 unless defined $self->{'gli'};
287       
288    # check keepold and removeold
289    my ($removeold, $keepold, $incremental, $incremental_mode)
290    = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
291                           $self->{'incremental'}, "archives",
292                           $collectcfg);
293
294    $self->{'removeold'}        = $removeold;
295    $self->{'keepold'}          = $keepold;
296    $self->{'incremental'}      = $incremental;
297    $self->{'incremental_mode'} = $incremental_mode;
298}
299
300sub process_files
301{
302    my $self = shift @_;
303    my ($inexport_mode,$collectcfg) = @_;
304
305    my $verbosity   = $self->{'verbosity'};
306    my $debug       = $self->{'debug'};
307
308    my $importdir   = $self->{'importdir'};
309    my $archivedir  = $self->{'archivedir'};
310
311    my $incremental = $self->{'incremental'};
312    my $incremental_mode = $self->{'incremental_mode'};
313
314    my $removeold   = $self->{'removeold'};
315    my $keepold     = $self->{'keepold'};
316
317    my $saveas      = $self->{'saveas'};
318    my $OIDtype     = $self->{'OIDtype'};
319    my $OIDmetadata = $self->{'OIDmetadata'};
320
321    my $out         = $self->{'out'};
322    my $faillog     = $self->{'faillog'};
323
324    my $maxdocs     = $self->{'maxdocs'};
325    my $gzip        = $self->{'gzip'};
326    my $groupsize   = $self->{'groupsize'};
327    my $sortmeta    = $self->{'sortmeta'};
328
329    my $removeprefix = $self->{'removeprefix'};
330    my $removesuffix = $self->{'removesuffix'};
331
332    my $gli         = $self->{'gli'};
333
334    print STDERR "<Import>\n" if $gli;
335   
336    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
337    if ($self->{'manifest'} ne "") {   
338    my $manifest_filename = $self->{'manifest'};
339
340    if (!&util::filename_is_absolute($manifest_filename)) {
341        $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
342    }
343
344    $self->{'manifest'} =~ s/[\\\/]+/\//g;
345    $self->{'manifest'} =~ s/\/$//;
346
347    $manifest_lookup->parse($manifest_filename);
348    }
349
350    my $manifest = $self->{'manifest'};
351
352    # load all the plugins
353    my $plugins = [];
354    if (defined $collectcfg->{'plugin'}) {
355    $plugins = $collectcfg->{'plugin'};
356    }
357
358    #some global options for the plugins
359    my @global_opts = ();
360
361
362    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $incremental_mode);
363    if (scalar(@$pluginfo) == 0) {
364    &gsprintf($out, "{import.no_plugins_loaded}\n");
365    die "\n";
366    }
367
368    # remove the old contents of the archives directory (and tmp directory) if needed
369    if ($removeold) {
370    if (-e $archivedir) {
371        &gsprintf($out, "{import.removing_archives}\n");
372        &util::rm_r ($archivedir);
373    }
374    my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
375    $tmpdir =~ s/[\\\/]+/\//g;
376    $tmpdir =~ s/\/$//;
377    if (-e $tmpdir) {
378        &gsprintf($out, "{import.removing_tmpdir}\n");
379        &util::rm_r ($tmpdir);
380    }
381    }
382
383    # create the archives dir if needed
384    &util::mk_all_dir($archivedir);
385
386    # read the archive information file
387##  my $arcinfo_doc_filename = &util::filename_cat ($archivedir, "archives.inf");
388
389    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
390    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
391    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));
392
393    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
394    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
395                           
396    my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
397    $archive_info->load_info ($arcinfo_doc_filename);
398
399    if ($manifest eq "") {
400    # Load in list of files in import folder from last import (if present)
401    $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
402    }
403
404    ####Use Plugout####
405    my ($plugout);
406    if (defined $collectcfg->{'plugout'}) {
407    # If a plugout was specified in the collect.cfg file, assume it is sensible
408    # We can't check the name because it could be anything, if it is a custom plugout
409    $plugout = $collectcfg->{'plugout'};
410    }
411    else{
412    if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
413        push @$plugout,"GreenstoneXMLPlugout";
414    }
415    else{
416        push @$plugout,$saveas."Plugout";
417    }
418    }
419
420    push @$plugout,("-output_info",$archive_info) if (defined $archive_info);
421    push @$plugout,("-verbosity",$verbosity)      if (defined $verbosity);
422    push @$plugout,("-gzip_output")               if ($gzip);
423    push @$plugout,("-group_size",$groupsize)     if (defined $groupsize);
424    push @$plugout,("-output_handle",$out)        if (defined);
425    push @$plugout,("-debug")                     if ($debug);
426   
427    my $processor = &plugout::load_plugout($plugout);                       
428    $processor->setoutputdir ($archivedir);
429    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
430    $processor->set_OIDtype ($OIDtype, $OIDmetadata);
431   
432    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
433   
434    if ($removeold) {
435        # occasionally, plugins may want to do something on remove old, eg pharos image indexing
436    &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
437    }
438    if ($manifest eq "") {
439    # process the import directory
440    my $block_hash = {};
441    my $metadata = {};
442    # gobal blocking pass may set up some metadata
443    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);
444
445
446    if ($incremental || $incremental_mode eq "onlyadd") {
447
448        prime_doc_oid_count($archivedir);
449
450
451        # Can now work out which files were new, already existed, and have
452        # been deleted
453       
454        new_vs_old_import_diff($archive_info,$block_hash,$importdir,
455                   $archivedir,$verbosity,$incremental_mode);
456       
457        my @new_files = sort keys %{$block_hash->{'new_files'}};
458        if (scalar(@new_files>0)) {
459        print STDERR "New files and modified metadata files since last import:\n  ";
460        print STDERR join("\n  ",@new_files), "\n";
461        }
462
463        if ($incremental) {
464               # only look for deletions if we are truely incremental
465        my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
466        # Filter out any in gsdl/tmp area
467        my @filtered_deleted_files = ();
468        my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
469        my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
470        $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
471        $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);
472                 
473        foreach my $df (@deleted_files) {
474            next if ($df =~ m/^$gsdl_tmp_area/);
475            next if ($df =~ m/^$collect_tmp_area/);
476           
477            push(@filtered_deleted_files,$df);
478        }       
479       
480
481        @deleted_files = @filtered_deleted_files;
482       
483        if (scalar(@deleted_files)>0) {
484            print STDERR "Files deleted since last import:\n  ";
485            print STDERR join("\n  ",@deleted_files), "\n";
486       
487       
488            &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);
489           
490            mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
491        }
492       
493        my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};
494       
495        if (scalar(@reindex_files)>0) {
496            print STDERR "Files to reindex since last import:\n  ";
497            print STDERR join("\n  ",@reindex_files), "\n";
498            &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
499            mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
500        }
501               
502        }
503       
504        # Play it safe, and run through the entire folder, only processing new or edited files
505        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
506
507    }
508    else {
509        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
510    }
511
512    }
513    else
514    {
515    #
516    # 1. Process delete files first
517    #
518
519    my @deleted_files = keys %{$manifest_lookup->{'delete'}};
520    my @full_deleted_files = ();
521
522    # ensure all filenames are absolute
523    foreach my $df (@deleted_files) {       
524        my $full_df =
525        (&util::filename_is_absolute($df))
526        ? $df
527        : &util::filename_cat($importdir,$df);
528
529        push(@full_deleted_files,$full_df);
530    }
531   
532    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
533    mark_docs_for_deletion($archive_info,{},
534                      \@full_deleted_files,
535                      $archivedir, $verbosity, "delete");
536
537
538    #
539    # 2. Now files for reindexing
540    #
541
542    my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
543    my @full_reindex_files = ();
544
545    # ensure all filenames are absolute
546    foreach my $rf (@reindex_files) {       
547        my $full_rf =
548        (&util::filename_is_absolute($rf))
549        ? $rf
550        : &util::filename_cat($importdir,$rf);
551
552        push(@full_reindex_files,$full_rf);
553    }
554   
555    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
556    mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");
557
558    # And now ensure the new version of the file processed by appropriate
559    # plugin
560    foreach my $full_rf (@full_reindex_files) {
561        &plugin::read ($pluginfo, "", $full_rf, {}, {}, $processor, $maxdocs, 0, $gli);
562    }
563
564
565    #
566    # 3. Now finally any new files
567    #
568
569    foreach my $file (keys %{$manifest_lookup->{'index'}}) {
570        &plugin::read ($pluginfo, $importdir, $file, {}, {}, $processor, $maxdocs, 0, $gli);
571    }
572
573
574    }
575
576    &plugin::end($pluginfo, $processor);
577
578    &plugin::deinit($pluginfo, $processor);
579
580    # Store the value of OIDCount (used in doc.pm) so it can be
581    # restored correctly to this value on an incremental build
582    store_doc_oid_count($archivedir);
583
584    # write out the archive information file
585    $processor->close_file_output() if $groupsize > 1;
586    $processor->close_group_output() if $processor->is_group();
587
588# The following 'if' statement is in the export.pl version of the script,
589# The reason for the 'if' statement is now given in export.pl
590# Unclear at this point if the same should be done here
591##    if (($saveas =~ m/^.*METS$/) || ($saveas eq "MARC")) {
592    # Not all export types need this (e.g. DSpace)
593
594    # should we still do this in debug mode??
595
596    # for backwards compatability with archvies.inf file
597    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
598    $archive_info->save_info($arcinfo_doc_filename);
599    }
600    else {
601    $archive_info->save_revinfo_db($arcinfo_src_filename);
602    }
603
604
605##    }
606
607    return $pluginfo;
608}
609
610
611sub generate_statistics
612{
613    my $self = shift @_;
614    my ($inexport_mode,$pluginfo) = @_;
615
616    my $statsfile = $self->{'statsfile'};
617    my $out       = $self->{'out'};
618    my $faillogname = $self->{'faillogname'};
619    my $gli       = $self->{'gli'};
620
621    # write out import stats
622    my $close_stats = 0;
623    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
624    if (open (STATS, ">$statsfile")) {
625        $statsfile = 'import::STATS';
626        $close_stats = 1;
627    } else {
628        &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
629        &gsprintf($out, "{import.stats_backup}\n");
630        $statsfile = 'STDERR';
631    }
632    }
633
634    &gsprintf($out, "\n");
635    &gsprintf($out, "*********************************************\n");
636    &gsprintf($out, "{import.complete}\n");
637    &gsprintf($out, "*********************************************\n");
638
639    &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
640    if ($close_stats) {
641    close STATS;
642    }
643
644    close OUT if $self->{'close_out'};
645    close FAILLOG;
646}
647
648
649
650
651
652
653
654sub oid_count_file {
655    my ($archivedir) = @_;
656    return &util::filename_cat ($archivedir, "OIDcount");
657}
658
659
660sub prime_doc_oid_count
661{
662    my ($archivedir) = @_;
663    my $oid_count_filename = &oid_count_file($archivedir);
664
665    if (-e $oid_count_filename) {
666    if (open(OIDIN,"<$oid_count_filename")) {
667        my $OIDcount = <OIDIN>;
668        chomp $OIDcount;       
669        close(OIDIN);
670
671        $doc::OIDcount = $OIDcount;     
672    }
673    else {
674       
675        print STDERR "Warning: unable to read document OID count from $oid_count_filename\n";
676        print STDERR "Setting value to 0\n";
677    }
678    }
679   
680}
681
682sub store_doc_oid_count
683{
684    # Use the file "OIDcount" in the archives directory to record
685    # what value doc.pm got up to
686
687    my ($archivedir) = @_;
688    my $oid_count_filename = &oid_count_file($archivedir);
689
690
691    if (open(OIDOUT,">$oid_count_filename")) {
692    print OIDOUT $doc::OIDcount, "\n";
693       
694    close(OIDOUT);
695    }
696    else {
697    print STDERR "Warning: unable to store document OID count\n";
698    }
699}
700
701
702
703sub new_vs_old_import_diff
704{
705    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;
706
707    # Get the infodbtype value for this collection from the arcinfo object
708    my $infodbtype = $archive_info->{'infodbtype'};
709
710    # in this method, we want to know if metadata files are modified or not.
711    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
712
713    my $archiveinf_timestamp = -M $arcinfo_doc_filename;
714
715    # First convert all files to absolute form
716    # This is to support the situation where the import folder is not
717    # the default
718   
719    my $prev_all_files = $archive_info->{'prev_import_filelist'};
720    my $full_prev_all_files = {};
721
722    foreach my $prev_file (keys %$prev_all_files) {
723
724    if (!&util::filename_is_absolute($prev_file)) {
725        my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
726        $full_prev_all_files->{$full_prev_file} = $prev_file;
727    }
728    else {
729        $full_prev_all_files->{$prev_file} = $prev_file;
730    }
731    }
732
733
734    # Figure out which are the new files, existing files and so
735    # by implication the files from the previous import that are not
736    # there any more => mark them for deletion
737    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {
738   
739    my $full_curr_file = $curr_file;
740
741    # entry in 'all_files' is moved to either 'existing_files',
742    # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'
743
744    if (!&util::filename_is_absolute($curr_file)) {
745        # add in import dir to make absolute
746        $full_curr_file = &util::filename_cat($importdir,$curr_file);
747    }
748
749    # figure out if new file or not
750    if (defined $full_prev_all_files->{$full_curr_file}) {
751        # delete it so that only files that need deleting are left
752        delete $full_prev_all_files->{$full_curr_file};
753       
754        # had it before. is it a metadata file?
755        if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
756       
757        # is it modified??
758        if (-M $full_curr_file < $archiveinf_timestamp) {
759            print STDERR "*** Detected a modified metadata file: $full_curr_file\n" if $verbosity > 2;
760            # its newer than last build
761            $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
762        }
763        }
764        else {
765        if ($incremental_mode eq "all") {
766           
767            # had it before
768            $block_hash->{'existing_files'}->{$full_curr_file} = 1;
769           
770        }
771        else {
772            # Warning in "onlyadd" mode, but had it before!
773            print STDERR "Warning: File $full_curr_file previously imported.\n";
774            print STDERR "         Treating as new file\n";
775           
776            $block_hash->{'new_files'}->{$full_curr_file} = 1;
777           
778        }
779        }
780    }
781    else {
782        if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
783        # the new file is the special sort of file greenstone uses
784        # to attach metadata to src documents
785        # i.e metadata.xml
786        # (but note, the filename used is not constrained in
787        # Greenstone to always be this)
788
789        print STDERR "***** Detected new metadata file: $full_curr_file\n" if $verbosity > 2;
790        $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
791        }
792        else {
793        $block_hash->{'new_files'}->{$full_curr_file} = 1;
794        }
795    }
796
797   
798    delete $block_hash->{'all_files'}->{$curr_file};
799    }
800
801
802
803
804    # Deal with complication of new or modified metadata files by forcing
805    # everything from this point down in the file hierarchy to
806    # be freshly imported. 
807    #
808    # This may mean files that have not changed are reindexed, but does
809    # guarantee by the end of processing all new metadata is correctly
810    # associated with the relevant document(s).
811
812    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
813    my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");
814
815    $situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
816    $situated_dir =~ s/\\/\\\\/g;  # need to protect windows slash \ in regular expression
817   
818    # Go through existing_files, and mark anything that is contained
819    # within 'situated_dir' to be reindexed (in case some of the metadata
820    # attaches to one of these files)
821
822    my $reindex_files = [];
823
824    foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {
825   
826        if ($existing_f =~ m/^$situated_dir/) {
827        push(@$reindex_files,$existing_f);
828        $block_hash->{'reindex_files'}->{$existing_f} = 1;
829        delete $block_hash->{'existing_files'}->{$existing_f};
830
831        }
832    }
833   
834    # metadata file needs to be in new_files list so parsed by MetadataXMLPlug
835    # (or equivalent)
836    $block_hash->{'new_files'}->{$new_mdf} = 1;
837
838    }
839
840    # go through remaining existing files and work out what has changed and needs to be reindexed.
841    my @existing_files = sort keys %{$block_hash->{'existing_files'}};
842
843    my $reindex_files = [];
844
845    foreach my $existing_filename (@existing_files) {
846    if (-M $existing_filename < $archiveinf_timestamp) {
847        # file is newer than last build
848       
849        my $existing_file = $existing_filename;
850        #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});
851
852        #my $collectdir_resafe = &util::filename_to_regex($collectdir);
853        #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;
854       
855        print STDERR "**** Reindexing existing file: $existing_file\n";
856
857        push(@$reindex_files,$existing_file);
858        $block_hash->{'reindex_files'}->{$existing_filename} = 1;
859    }
860
861    }
862
863   
864    # By this point full_prev_all_files contains the files
865    # mentioned in archiveinf-src.db but are not in the 'import'
866    # folder (or whatever was specified through -importdir ...)
867
868    # This list can contain files that were created in the 'tmp' or
869    # 'cache' areas (such as screen-size and thumbnail images).
870    #
871    # In building the final list of files to delete, we test to see if
872    # it exists on the filesystem and if it does (unusual for a "normal"
873    # file in import, but possible in the case of 'tmp' files),
874    # supress it from going into the final list
875
876    my $collectdir = $ENV{'GSDLCOLLECTDIR'};
877
878    my @deleted_files = values %$full_prev_all_files;
879    map { my $curr_file = $_;
880      my $full_curr_file = $curr_file;
881
882      if (!&util::filename_is_absolute($curr_file)) {
883          # add in import dir to make absolute
884
885          $full_curr_file = &util::filename_cat($collectdir,$curr_file);
886      }
887
888
889      if (!-e $full_curr_file) {
890          $block_hash->{'deleted_files'}->{$curr_file} = 1;
891      }
892      } @deleted_files;
893
894
895
896}
897
898
899# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
900# $mode is 'delete' or 'reindex'
901sub mark_docs_for_deletion
902{
903    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;
904
905    my $mode_text = "deleted from index";
906    if ($mode eq "reindex") {
907    $mode_text = "reindexed";
908    }
909
910    # Get the infodbtype value for this collection from the arcinfo object
911    my $infodbtype = $archive_info->{'infodbtype'};
912
913    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
914    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);
915
916
917    # record files marked for deletion in arcinfo
918    foreach my $file (@$deleted_files) {
919    # use 'archiveinf-src' info database file to look up all the OIDs
920    # that this file is used in (note in most cases, it's just one OID)
921   
922    my $src_rec_string = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
923    my $src_rec = &dbutil::convert_infodb_string_to_hash($src_rec_string);
924    my $oids = $src_rec->{'oid'};
925    my $file_record_deleted = 0;
926
927    # delete the src record
928    my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
929    &dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
930    &dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);
931
932
933    foreach my $oid (@$oids) {
934
935        # find the source doc (the primary file that becomes this oid)
936        my $doc_rec_string = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
937        my $doc_rec = &dbutil::convert_infodb_string_to_hash($doc_rec_string);
938        my $doc_source_file = $doc_rec->{'src-file'}->[0];
939        if (!&util::filename_is_absolute($doc_source_file)) {
940        $doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
941        }
942
943        if ($doc_source_file ne $file) {
944        # its an associated or metadata file
945       
946        # mark source doc for reimport as one of its assoc files has changed or deleted
947        $block_hash->{'reindex_files'}->{$doc_source_file} = 1;
948       
949        }
950        my $curr_status = $archive_info->get_status_info($oid);
951        if (defined($curr_status) && (($curr_status ne "D"))) {
952        if ($verbosity>1) {
953            print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
954        }
955        # mark oid for deletion (it will be deleted or reimported)
956        $archive_info->set_status_info($oid,"D");
957        my $val = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
958        $val =~ s/^<index-status>(.*)$/<index-status>D/m;
959
960        my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
961        my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");
962
963        &dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
964        &dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
965        }
966    }
967   
968    }
969    # now go through and check that we haven't marked any primary files for reindex (because their associated files have changed/deleted) when they have been deleted themselves.
970    foreach my $file (@$deleted_files) {
971    if (defined $block_hash->{'reindex_files'}->{$file}) {
972        delete $block_hash->{'reindex_files'}->{$file};
973    }
974    }
975
976
977}
978
979
980
9811;
Note: See TracBrowser for help on using the browser.