root/main/trunk/greenstone2/perllib/inexport.pm @ 26451

Revision 26451, 38.5 KB (checked in by ak19, 7 years ago)

1. Fixed processing of the collectionconfig's indexOption element. 2. Correct set of changes for processing the new toplevel importOptions and buildOptions elements of collectionConfig.xml (which can contain such options as OIDtype, OIDmetadata, verbosity). 3. Undoing previous commits, since the importOptions and buildOptions elements are not nested inside plugins but are one of the toplevel elements of collectionConfig.xml. And do not need the recently-committed changes to inexport.pm either, since any command line args for import and buildOptions will override what's in collectionConfig.xml anyway.

  • Property svn:executable set to *
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package inexport;
27
28use strict;
29
30no strict 'refs'; # allow filehandles to be variables and vice versa
31no strict 'subs'; # allow barewords (eg STDERR) as function arguments
32
33use arcinfo;
34use colcfg;
35use dbutil;
36use doc;
37use plugin;
38use plugout;
39use manifest;
40use inexport;
41use util;
42use scriptutil;
43use FileHandle;
44use gsprintf 'gsprintf';
45use printusage;
46use parse2;
47
48use File::Basename;
49
# Constructor for command-line use (import.pl / export.pl).
#   $mode                - "import" or "export"
#   $argv                - ref to the remaining command-line arguments
#   $options             - parse2-style option spec for this script
#   $opt_listall_options - option spec used when -listall is given
# Parses the options into the new object, handles -xml/-listall/-h (which
# print usage and die or return early), redirects output when -out names a
# file, and records the collection name (the single leftover argument).
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    # parse2::parse stores each recognised option directly into $self and
    # returns the number of unparsed args left in @$argv.
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
	&gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($self->{'listall'}) {
	if ($self->{'xml'}) {
	    &PrintUsage::print_xml_usage($opt_listall_options);
	}
	else
	{
	    &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
	}
	die "\n";
    }


    # -xml prints machine-readable usage and returns a (mostly unused)
    # object instead of dying.
    if ($self->{'xml'}) {
        &PrintUsage::print_xml_usage($options);
	print "\n";
	return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
	&gsprintf::output_strings_in_UTF8;
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also

    if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
    {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }

    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    # If -out names a real file (not STDERR/STDOUT), open it on the
    # package-level bareword handle OUT; generate_statistics() later closes
    # it when 'close_out' is set.
    if ($out !~ /^(STDERR|STDOUT)$/i) {
	open (OUT, ">$out") ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
	$out = 'inexport::OUT';
	$self->{'close_out'} = 1;
    }
    # Method call on a string handle name: works because of
    # "no strict 'refs'" and FileHandle being loaded at the top of the file.
    $out->autoflush(1);
    $self->{'out'} = $out;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # The MPI parallel helper is only loaded when -jobs > 1 was requested.
    if ((defined $self->{'jobs'}) && ($self->{'jobs'}>1)) {
	require ParallelInexport;
    }

    return bless $self, $class;
}
127
# Simplified constructor for use from CGI scripts: no command-line
# parsing, output fixed to STDERR, and the collect directory resolved
# either through the optional gsdl_cgi helper (GS3) or from $GSDLHOME (GS2).
sub newCGI
{
    my $class = shift (@_);
    my ($mode,$collect,$gsdl_cgi,$opt_site) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    $self->{'out'} = STDERR;

    if (defined $gsdl_cgi) {
	# Running under a CGI wrapper: it knows where the site's
	# collect directory lives.
	$self->{'site'}       = $opt_site;
	$self->{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }
    else {
	# Stand-alone Greenstone 2 layout: $GSDLHOME/collect.
	$self->{'site'}       = "";
	$self->{'collectdir'} = &util::filename_cat($ENV{'GSDLHOME'},"collect");
    }

    $self->{'faillog'}    = "";
    $self->{'collection'} = $collect;

    return bless $self, $class;
}
# Accessor: the name of the collection this inexport object operates on.
sub get_collection
{
    my ($self) = @_;
    return $self->{'collection'};
}
159
160
# Resolve the named collection, add its perllib to @INC, open the fail
# log, and read the collection configuration file.
# Returns ($config_filename, $collectcfg) where $collectcfg is the parsed
# configuration hash produced by colcfg::read_collection_cfg.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site       = $self->{'site'};
    my $out        = $self->{'out'};

    # use_collection sets up collection environment (incl. GSDLCOLLECTDIR)
    # and returns "" when the collection cannot be found.
    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
	&PrintUsage::print_txt_usage($options, "{import.params}");
	die "\n";
    }

    # set gs_version 2/3: a non-empty site implies Greenstone 3
    $self->{'gs_version'} = "2";
    if ((defined $site) && ($site ne "")) {
	# gs3
	$self->{'gs_version'} = "3";
    }
    # add collection's perllib dir into include path in
    # case we have collection specific modules
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
	$faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # Opened on the package-level bareword handle FAILLOG;
    # generate_statistics() closes it at the end of the run.
    open (FAILLOG, ">$faillog") ||
	(&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);


    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    # Method call on a handle-name string; enabled by "no strict 'refs'"
    # plus FileHandle.
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;

    # Read in the collection configuration file.
    # get_collect_cfg_name picks collect.cfg (GS2) or collectionConfig.xml
    # (GS3) and reports which mode it found.
    my ($config_filename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
206
# Merge collection-configuration settings into the options already parsed
# from the command line.  Precedence throughout: a value already set on
# $self (from the command line) wins, then $collectcfg, then a hard-coded
# default.  Also normalises the import/archive directory paths and
# resolves removeold/keepold/incremental via scriptutil.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity  = $self->{'verbosity'};
    my $debug      = $self->{'debug'};
    my $importdir  = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'} || "";
    my $out        = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
      $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
	# we can't use the text version for archives dbs.
	$collectcfg->{'infodbtype'} = "gdbm";
    }

    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
	$importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
	$archivedir = $collectcfg->{'archivedir'};
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    # The import directory must already exist; nothing to do otherwise.
    if (!-e $importdir) {
	&gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
	die "\n";
    }
    $self->{'importdir'} = $importdir;

    # Default output directory depends on whether we are importing
    # ("archives") or exporting ("export").
    if ($archivedir eq "") {
	if ($inexport_mode eq "import") {
	    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
	}
	elsif ($inexport_mode eq "export") {
	    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
	}
	else {
	    print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
	    print STDERR "         Defaulting to 'archives' for file output\n";
	    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
	}
    }

    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $self->{'archivedir'} = $archivedir;

    # verbosity: command line, else collect.cfg, else 2
    if ($verbosity !~ /\d+/) {
	if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
	    $verbosity = $collectcfg->{'verbosity'};
	} else {
	    $verbosity = 2; # the default
	}
    }
    $self->{'verbosity'} = $verbosity;

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
	$self->{'manifest'} = $collectcfg->{'manifest'};
    }

    # gzip: collect.cfg may turn it on (string "true"), never off
    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
	if ($collectcfg->{'gzip'} =~ /^true$/i) {
	    $self->{'gzip'} = 1;
	}
    }

    # maxdocs: -1 means "no limit"
    if ($self->{'maxdocs'} !~ /\-?\d+/) {
	if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
	    $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
	} else {
	    $self->{'maxdocs'} = -1; # the default
	}
    }

    # groupsize == 1 is treated as "not set on the command line", so the
    # collect.cfg value may override it.
    if ((defined $self->{'groupsize'}) && ($self->{'groupsize'} == 1)) {
	if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
	    $self->{'groupsize'} = $collectcfg->{'groupsize'};
	}
    }

    # OIDtype must be one of the four recognised schemes; default "hash"
    if (!defined $self->{'OIDtype'}
	|| ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) {
	if (defined $collectcfg->{'OIDtype'}
	    && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
	    $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
	} else {
	    $self->{'OIDtype'} = "hash"; # the default
	}
    }

    if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
	if (defined $collectcfg->{'OIDmetadata'}) {
	    $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
	} else {
	    $self->{'OIDmetadata'} = "dc.Identifier"; # the default
	}
    }

    my $sortmeta = $self->{'sortmeta'};
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
	$sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $self->{'groupsize'} > 1) {
	&gsprintf($out, "{import.cannot_sort}\n\n");
	$sortmeta = undef;
    }
    $self->{'sortmeta'} = $sortmeta;

    if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") {
	$self->{'removeprefix'} = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") {
	$self->{'removesuffix'} = $collectcfg->{'removesuffix'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
	$self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
	$self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
	= &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
						   $self->{'incremental'}, $checkdir,
						   $collectcfg);

    $self->{'removeold'}        = $removeold;
    $self->{'keepold'}          = $keepold;
    $self->{'incremental'}      = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;
}
356
# The main import/export pipeline: loads plugins and a plugout, works out
# which source files are new/changed/deleted (from a manifest or an
# incremental diff against the previous import), runs the plugins over the
# import directory, and writes out the archive information databases.
# Returns $pluginfo (the loaded plugin list) for generate_statistics().
sub process_files
{
    my $self = shift @_;
    my ($config_filename,$collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    # Unpack everything set up by new()/set_collection_options() into
    # locals for readability below.
    my $verbosity   = $self->{'verbosity'};
    my $debug       = $self->{'debug'};

    my $importdir   = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'};

    my $incremental = $self->{'incremental'};
    my $incremental_mode = $self->{'incremental_mode'};

    my $gs_version = $self->{'gs_version'};

    my $removeold   = $self->{'removeold'};
    my $keepold     = $self->{'keepold'};

    my $saveas      = $self->{'saveas'};
    my $OIDtype     = $self->{'OIDtype'};
    my $OIDmetadata = $self->{'OIDmetadata'};

    my $out         = $self->{'out'};
    my $faillog     = $self->{'faillog'};

    my $maxdocs     = $self->{'maxdocs'};
    my $gzip        = $self->{'gzip'};
    my $groupsize   = $self->{'groupsize'};
    my $sortmeta    = $self->{'sortmeta'};

    my $removeprefix = $self->{'removeprefix'};
    my $removesuffix = $self->{'removesuffix'};

    my $gli          = $self->{'gli'};

    my $jobs         = $self->{'jobs'};
    my $epoch        = $self->{'epoch'};

    # related to export
    my $xsltfile         = $self->{'xsltfile'};
    my $group_marc       = $self->{'group_marc'};
    my $mapping_file     = $self->{'mapping_file'};
    my $xslt_mets        = $self->{'xslt_mets'};
    my $xslt_txt         = $self->{'xslt_txt'};
    my $fedora_namespace = $self->{'fedora_namespace'};
    my $metadata_prefix  = $self->{'metadata_prefix'};

    # GLI watches STDERR for these markers
    if ($inexport_mode eq "import") {
	print STDERR "<Import>\n" if $gli;
    }
    else {
	print STDERR "<export>\n" if $gli;
    }

    # Optional manifest file explicitly lists files to index/reindex/delete
    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
    if ($self->{'manifest'} ne "") {
	my $manifest_filename = $self->{'manifest'};

	if (!&util::filename_is_absolute($manifest_filename)) {
	    $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
	}

	$self->{'manifest'} =~ s/[\\\/]+/\//g;
	$self->{'manifest'} =~ s/\/$//;

	$manifest_lookup->parse($manifest_filename);
    }

    my $manifest = $self->{'manifest'};

    # load all the plugins
    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
	$plugins = $collectcfg->{'plugin'};
    }

    my $plugin_incr_mode = $incremental_mode;
    if ($manifest ne "") {
	# if we have a manifest file, then we pretend we are fully incremental for plugins
	$plugin_incr_mode = "all";
    }
    # some global options for the plugins
    my @global_opts = ();

    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode, $gs_version);
    if (scalar(@$pluginfo) == 0) {
	&gsprintf($out, "{import.no_plugins_loaded}\n");
	die "\n";
    }

    # remove the old contents of the archives directory (and tmp
    # directory) if needed

    if ($removeold) {
	if (-e $archivedir) {
	    &gsprintf($out, "{import.removing_archives}\n");
	    &util::rm_r ($archivedir);
	}
	my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
	$tmpdir =~ s/[\\\/]+/\//g;
	$tmpdir =~ s/\/$//;
	if (-e $tmpdir) {
	    &gsprintf($out, "{import.removing_tmpdir}\n");
	    &util::rm_r ($tmpdir);
	}
    }

    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file

    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);

    my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
    $archive_info->load_info ($arcinfo_doc_filename);

    if ($manifest eq "") {
	# Load in list of files in import folder from last import (if present)
	$archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    }

    #### Use Plugout ####
    # Select the plugout: explicit collect.cfg setting wins, otherwise it
    # is derived from -saveas (with mode-specific defaults and whitelist).
    my $plugout;

    if ($inexport_mode eq "import") {
	if (defined $collectcfg->{'plugout'}) {
	    # If a plugout was specified in the collect.cfg file, assume it is sensible
	    # We can't check the name because it could be anything, if it is a custom plugout
	    $plugout = $collectcfg->{'plugout'};
	}
	else{
	    if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
		push @$plugout,"GreenstoneXMLPlugout";
	    }
	    else{
		push @$plugout,$saveas."Plugout";
	    }
	}
    }
    else {
	if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
	    $plugout = $collectcfg->{'plugout'};
	}
	else{
	    if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
		push @$plugout,"GreenstoneMETSPlugout";
	    }
	    else{
		push @$plugout,$saveas."Plugout";
	    }
	}
    }

    my $plugout_name = $plugout->[0];

    # Build the plugout's option list (plugout name followed by -flag/value
    # pairs), only adding options that were actually supplied.
    push @$plugout,("-output_info",$archive_info)  if (defined $archive_info);
    push @$plugout,("-verbosity",$verbosity)       if (defined $verbosity);
    push @$plugout,("-debug")                      if ($debug);
    push @$plugout,("-group_size",$groupsize)      if (defined $groupsize);
    push @$plugout,("-gzip_output")                if ($gzip);
    push @$plugout,("-output_handle",$out)         if (defined $out);

    push @$plugout,("-xslt_file",$xsltfile)        if (defined $xsltfile && $xsltfile ne "");

    # Plugout-specific options
    if ($plugout_name =~ m/^MARCXMLPlugout$/) {
	push @$plugout,("-group")                      if ($group_marc);
	push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "");
    }
    if ($plugout_name =~ m/^.*METSPlugout$/) {
	push @$plugout,("-xslt_mets",$xslt_mets)       if (defined $xslt_mets && $xslt_mets ne "");
	push @$plugout,("-xslt_txt",$xslt_txt)         if (defined $xslt_txt && $xslt_txt ne "");
    }

    if ($plugout_name eq "FedoraMETSPlugout") {
	push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "");
    }

    if ($plugout_name eq "DSpacePlugout") {
	push @$plugout,("-metadata_prefix",$metadata_prefix) if (defined $metadata_prefix && $metadata_prefix ne "");
    }

    my $processor = &plugout::load_plugout($plugout);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($removeold) {
        # occasionally, plugins may want to do something on remove
        # old, eg pharos image indexing
	&plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
    }

    # process the import directory
    my $block_hash = {};
    $block_hash->{'new_files'} = {};
    $block_hash->{'reindex_files'} = {};
    my $metadata = {};

    # global blocking pass may set up some metadata
    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);

    if ($manifest ne "") {
	# Manifest-driven run: the manifest dictates exactly which files
	# are to be deleted, reindexed, and newly indexed.
	#
	# 1. Process delete files first
	#
	my @deleted_files = keys %{$manifest_lookup->{'delete'}};
	my @full_deleted_files = ();

	# ensure all filenames are absolute
	foreach my $df (@deleted_files) {
	    my $full_df =
		(&util::filename_is_absolute($df))
		? $df
		: &util::filename_cat($importdir,$df);

	    # directories in the manifest expand to all their contents
	    if (-d $full_df) {
		&add_dir_contents_to_list($full_df, \@full_deleted_files);
	    } else {
		push(@full_deleted_files,$full_df);
	    }
	}

	&plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
	mark_docs_for_deletion($archive_info,{},
			       \@full_deleted_files,
			       $archivedir, $verbosity, "delete");


	#
	# 2. Now files for reindexing
	#

	my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
	my @full_reindex_files = ();
	# ensure all filenames are absolute
	foreach my $rf (@reindex_files) {
	    my $full_rf =
		(&util::filename_is_absolute($rf))
		? $rf
		: &util::filename_cat($importdir,$rf);

	    if (-d $full_rf) {
		&add_dir_contents_to_list($full_rf, \@full_reindex_files);
	    } else {
		push(@full_reindex_files,$full_rf);
	    }
	}

	&plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
	mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");

	# And now to ensure the new version of the file processed by
	# appropriate plugin, we need to add it to block_hash reindex list
	foreach my $full_rf (@full_reindex_files) {
	    $block_hash->{'reindex_files'}->{$full_rf} = 1;
	}


	#
	# 3. Now finally any new files - add to block_hash new_files list
	#

	my @new_files = keys %{$manifest_lookup->{'index'}};
	my @full_new_files = ();

	foreach my $nf (@new_files) {
	    # ensure filename is absolute
	    my $full_nf =
		(&util::filename_is_absolute($nf))
		? $nf
		: &util::filename_cat($importdir,$nf);

	    if (-d $full_nf) {
		&add_dir_contents_to_list($full_nf, \@full_new_files);
	    } else {
		push(@full_new_files,$full_nf);
	    }
	}

	# NOTE(review): this lexical shadows the $arcinfo_src_filename
	# declared earlier in this sub (same value); harmless but confusing.
	my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
	my $arcinfodb_map = {};
	&dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
	foreach my $f (@full_new_files) {
	    # check that we haven't seen it already
	    if (defined $arcinfodb_map->{$f}) {
		# TODO make better warning
		print STDERR "Warning: $f already in src archive, \n";
	    } else {
		$block_hash->{'new_files'}->{$f} = 1;
	    }
	}

	undef $arcinfodb_map;
    }
    else {
	# if incremental, we read through the import folder to see what's changed.

	if ($incremental || $incremental_mode eq "onlyadd") {
	    # restore doc.pm's OID counter from the previous run
	    prime_doc_oid_count($archivedir);

	    # Can now work out which files were new, already existed, and have
	    # been deleted

	    new_vs_old_import_diff($archive_info,$block_hash,$importdir,
				   $archivedir,$verbosity,$incremental_mode);

	    my @new_files = sort keys %{$block_hash->{'new_files'}};
	    # NOTE(review): almost certainly intended scalar(@new_files) > 0;
	    # as written, @new_files>0 is evaluated first (array in scalar
	    # context compared with 0), so it still behaves correctly by accident.
	    if (scalar(@new_files>0)) {
		print STDERR "New files and modified metadata files since last import:\n  ";
		print STDERR join("\n  ",@new_files), "\n";
	    }

	    if ($incremental) {
                # only look for deletions if we are truly incremental
		my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
		# Filter out any in gsdl/tmp area
		my @filtered_deleted_files = ();
		my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
		my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
		$gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
		$collect_tmp_area = &util::filename_to_regex($collect_tmp_area);

		foreach my $df (@deleted_files) {
		    next if ($df =~ m/^$gsdl_tmp_area/);
		    next if ($df =~ m/^$collect_tmp_area/);

		    push(@filtered_deleted_files,$df);
		}


		@deleted_files = @filtered_deleted_files;

		if (scalar(@deleted_files)>0) {
		    print STDERR "Files deleted since last import:\n  ";
		    print STDERR join("\n  ",@deleted_files), "\n";


		    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);

		    mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
		}

		my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};

		if (scalar(@reindex_files)>0) {
		    print STDERR "Files to reindex since last import:\n  ";
		    print STDERR join("\n  ",@reindex_files), "\n";
		    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
		    mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
		}

	    }
	}
    }

    # Check for existence of the file that's to contain earliestDateStamp in archivesdir
    # Do nothing if the file already exists (file exists on incremental build).
    # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
    # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
    # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
    # oailastmodified and oailastmodifieddate
    my $earliestDatestampFile = &util::filename_cat($archivedir, "earliestDatestamp");
    if (!-f $earliestDatestampFile && -d $archivedir) {
	my $current_time_in_seconds = time; # in seconds

	if(open(FOUT, ">$earliestDatestampFile")) {
	    # || (&gsprintf(STDERR, "{common.cannot_open}: $!\n", $earliestDatestampFile) && die);
	    print FOUT $current_time_in_seconds;
	    close(FOUT);
	}
	else {
	    # failure to write the datestamp is non-fatal
	    &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
	}

    }

    # now, whichever mode we are in, we can process the entire import folder
    if ((defined $jobs) && ($jobs > 1))
    {
	# if jobs are set to >1, run in parallel using MPI helper
	# [hs, 1 july 2010]
	&ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,
					      $self->{'collection'}, $self->{'site'});
    }
    else
    {
	&plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }


    if ($saveas eq "FedoraMETS") {
	# create collection "doc obj" for Fedora that contains
	# collection-level metadata

	my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
	$doc_obj->set_OID("collection");

	my $col_name = undef;
	my $col_meta = $collectcfg->{'collectionmeta'};

	if (defined $col_meta) {
	    store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
	    store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
	}
	$processor->process($doc_obj);
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # Store the value of OIDCount (used in doc.pm) so it can be
    # restored correctly to this value on an incremental build
    store_doc_oid_count($archivedir);

    # write out the archive information file
    $processor->close_file_output() if (defined $groupsize) && ($groupsize > 1);
    $processor->close_group_output() if $processor->is_group();

    # for backwards compatibility with archives.inf file
    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
	$archive_info->save_info($arcinfo_doc_filename);
    }
    else {
	$archive_info->save_revinfo_db($arcinfo_src_filename);
    }

    return $pluginfo;
}
797
798
# Write the end-of-run statistics (via the plugins) and close the
# package-level output/fail-log handles opened in new() and
# read_collection_cfg().  Statistics are only produced for a
# single-job (non-parallel) run.
sub generate_statistics
{
    my $self = shift @_;
    my ($pluginfo) = @_;

    my $inexport_mode = $self->{'mode'};

    my $statsfile   = $self->{'statsfile'};
    my $out         = $self->{'out'};
    my $faillogname = $self->{'faillogname'};
    my $gli         = $self->{'gli'};
    my $jobs        = $self->{'jobs'};

    # write out import stats

    if ((!defined $jobs) || ($jobs == 1))
    {
	# NOTE: statistics are skipped when running multiple parallel jobs
	# (the original comment here said the opposite of what the
	# condition does) [hs, 1 july 2010]

	my $close_stats = 0;
	# If -statsfile names a real file, open it on the bareword handle
	# STATS and pass the handle name to write_stats; fall back to
	# STDERR when the file cannot be opened.
	if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
	    if (open (STATS, ">$statsfile")) {
		$statsfile = 'inexport::STATS';
		$close_stats = 1;
	    } else {
		&gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
		&gsprintf($out, "{import.stats_backup}\n");
		$statsfile = 'STDERR';
	    }
	}

	&gsprintf($out, "\n");
	&gsprintf($out, "*********************************************\n");
	&gsprintf($out, "{$inexport_mode.complete}\n");
	&gsprintf($out, "*********************************************\n");

	&plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
	if ($close_stats) {
	    close STATS;
	}
    }

    # OUT was opened in new() when -out named a file; FAILLOG was opened
    # in read_collection_cfg().
    close OUT if $self->{'close_out'};
    close FAILLOG;
}
845
846
# Copy one collection-level metadata field (e.g. "collectionname") out of
# the collect.cfg collectionmeta structure into the top section of the
# given document object as ex.* metadata.  A language-qualified key of the
# form "[l=xx]" becomes the label "ex.<field>^xx".  The English (or
# unqualified) collection name is additionally recorded as dc.Title.
sub store_collectionmeta
{
    my ($collectionmeta,$field,$doc_obj) = @_;

    my $top_section    = $doc_obj->get_top_section();
    my $values_by_lang = $collectionmeta->{$field};

    foreach my $lang_key (keys %$values_by_lang)
    {
	my $value = $values_by_lang->{$lang_key};

	# Base label; append "^lang" when the key carries a language
	# qualifier like "[l=en]".
	my $meta_label = "ex.$field";
	if ($lang_key =~ m/^\[l=(.*?)\]$/)
	{
	    my $lang_suffix = $1;
	    $meta_label .= "^$lang_suffix";
	}

	$doc_obj->add_utf8_metadata($top_section, $meta_label, $value);

	# see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
	# while "collectionname" in GS2 is called "name" in GS3.
	# Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
	if (($meta_label eq "ex.collectionname^en") || ($meta_label eq "ex.collectionname"))
	{
	    $doc_obj->add_utf8_metadata($top_section, "dc.Title", $value);
	}
    }
}
884
885
# Return the full path of the "OIDcount" file kept inside the given
# archives directory (used to persist the document OID counter between
# import runs -- see prime_doc_oid_count / store_doc_oid_count).
sub oid_count_file {
    my ($archivedir) = @_;
    my $count_path = &util::filename_cat ($archivedir, "OIDcount");
    return $count_path;
}
890
891
# Restore $doc::OIDcount from the "OIDcount" file saved in the archives
# directory by a previous run, so newly assigned document OIDs continue
# from where the last import stopped.
#
# Silently does nothing when no OIDcount file exists; prints a warning
# (but continues) when the file exists but cannot be opened.
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    if (-e $oid_count_filename) {
        # 3-arg open with a lexical handle: the 2-arg form would treat
        # leading mode characters in the filename as an open mode
        if (open(my $oid_in, "<", $oid_count_filename)) {
            my $OIDcount = <$oid_in>;
            close($oid_in);

            # Guard against an empty file: <$oid_in> then returns undef,
            # and chomp/assignment on undef would clobber the counter
            if (defined $OIDcount) {
                chomp $OIDcount;
                $doc::OIDcount = $OIDcount;
            }
        }
        else {
            &gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
        }
    }
}
911
sub store_doc_oid_count
{
    # Use the file "OIDcount" in the archives directory to record
    # what value doc.pm got up to, so the next import can resume the
    # OID sequence (counterpart of prime_doc_oid_count).
    #
    # Prints a warning to STDERR if the file cannot be written.

    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # 3-arg open with a lexical handle (the 2-arg bareword form is
    # vulnerable to mode characters embedded in the filename)
    if (open(my $oid_out, ">", $oid_count_filename)) {
        print $oid_out $doc::OIDcount, "\n";

        # close() is checked on the write handle: buffered write errors
        # (e.g. disk full) only surface here
        if (!close($oid_out)) {
            &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
        }
    }
    else {
        &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
930
931
932
# Compare the current import-folder contents against the file list recorded
# by the previous import, partitioning $block_hash->{'all_files'} into
# 'new_files', 'existing_files', 'reindex_files', 'deleted_files' and
# 'new_or_modified_metadata_files' so an incremental import only
# reprocesses what it must.  All classification hashes are updated in
# place; 'all_files' is emptied as entries are classified.
#
# Parameters:
#   $archive_info     - arcinfo object holding the previous import's state
#   $block_hash       - hashref of file-list hashes (modified in place)
#   $importdir        - collection import directory
#   $archivedir       - collection archives directory
#   $verbosity        - numeric level; >= 2 prints per-file diagnostics
#   $incremental_mode - "all" or (presumably) "onlyadd" -- any other value
#                       takes the "onlyadd" branch below; TODO confirm
sub new_vs_old_import_diff
{
    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    # -M gives age in days; smaller value => file modified more recently
    # than the archive info database (i.e. since the last build)
    my $archiveinf_timestamp = -M $arcinfo_doc_filename;

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default

    my $prev_all_files = $archive_info->{'prev_import_filelist'};
    # maps absolute path -> path as recorded in the previous import list
    my $full_prev_all_files = {};

    foreach my $prev_file (keys %$prev_all_files) {

        if (!&util::filename_is_absolute($prev_file)) {
            my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
            $full_prev_all_files->{$full_prev_file} = $prev_file;
        }
        else {
            $full_prev_all_files->{$prev_file} = $prev_file;
        }
    }


    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

        my $full_curr_file = $curr_file;

        # entry in 'all_files' is moved to either 'existing_files',
        # 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

        if (!&util::filename_is_absolute($curr_file)) {
            # add in import dir to make absolute
            $full_curr_file = &util::filename_cat($importdir,$curr_file);
        }

        # figure out if new file or not
        if (defined $full_prev_all_files->{$full_curr_file}) {
            # delete it so that only files that need deleting are left
            delete $full_prev_all_files->{$full_curr_file};

            # had it before. is it a metadata file?
            if ($block_hash->{'metadata_files'}->{$full_curr_file}) {

                # is it modified??  (-M smaller => touched since last build)
                if (-M $full_curr_file < $archiveinf_timestamp) {
                    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
                    # its newer than last build
                    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
                }
                # NOTE(review): an *unmodified* metadata file falls through
                # unclassified here -- presumably intentional, as it is
                # still removed from 'all_files' below; verify
            }
            else {
                if ($incremental_mode eq "all") {

                    # had it before
                    $block_hash->{'existing_files'}->{$full_curr_file} = 1;

                }
                else {
                    # Warning in "onlyadd" mode, but had it before!
                    print STDERR "Warning: File $full_curr_file previously imported.\n";
                    print STDERR "         Treating as new file\n";

                    $block_hash->{'new_files'}->{$full_curr_file} = 1;

                }
            }
        }
        else {
            if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
                # the new file is the special sort of file greenstone uses
                # to attach metadata to src documents
                # i.e metadata.xml
                # (but note, the filename used is not constrained in
                # Greenstone to always be this)

                print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
                $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
            }
            else {
                $block_hash->{'new_files'}->{$full_curr_file} = 1;
            }
        }

        # classified -- remove from the pending set
        delete $block_hash->{'all_files'}->{$curr_file};
    }




    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported.
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
        # split off the directory that "situates" this metadata file;
        # everything under it inherits the metadata
        my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

        $situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
        $situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression

        # Go through existing_files, and mark anything that is contained
        # within 'situated_dir' to be reindexed (in case some of the metadata
        # attaches to one of these files)

        # NOTE(review): this array is populated but never read afterwards
        my $reindex_files = [];

        foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

            if ($existing_f =~ m/^$situated_dir/) {

                print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

                push(@$reindex_files,$existing_f);
                $block_hash->{'reindex_files'}->{$existing_f} = 1;
                delete $block_hash->{'existing_files'}->{$existing_f};

            }
        }

        # metadata file needs to be in new_files list so parsed by MetadataXMLPlug
        # (or equivalent)
        $block_hash->{'new_files'}->{$new_mdf} = 1;

    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    # NOTE(review): shadows the loop-scoped $reindex_files above and is
    # likewise never read after being filled
    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
        if (-M $existing_filename < $archiveinf_timestamp) {
            # file is newer than last build

            my $existing_file = $existing_filename;
            #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});

            #my $collectdir_resafe = &util::filename_to_regex($collectdir);
            #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;

            print STDERR "**** Reindexing existing file: $existing_file\n";

            push(@$reindex_files,$existing_file);
            $block_hash->{'reindex_files'}->{$existing_filename} = 1;
        }

    }


    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # supress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    my @deleted_files = values %$full_prev_all_files;
    # NOTE(review): map in void context, used purely for its side effect
    # on $block_hash->{'deleted_files'} -- a foreach would read better
    map { my $curr_file = $_;
          my $full_curr_file = $curr_file;

          if (!&util::filename_is_absolute($curr_file)) {
              # add in import dir to make absolute

              $full_curr_file = &util::filename_cat($collectdir,$curr_file);
          }


          if (!-e $full_curr_file) {
              $block_hash->{'deleted_files'}->{$curr_file} = 1;
          }
          } @deleted_files;



}
1130
1131
# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
# $mode is 'delete' or 'reindex'
#
# For every file in @$deleted_files: looks up (via the archiveinf-src
# database) each document OID the file contributed to, deletes the file's
# src record, and flips each affected OID's <index-status> to "D" in the
# archiveinf-doc database so buildcol.pl will delete or reimport it.
# When the file is an associated/metadata file (not the document's primary
# source), the primary source is queued in $block_hash->{'reindex_files'}.
sub mark_docs_for_deletion
{
    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;

    # text used only in the progress message printed below
    my $mode_text = "deleted from index";
    if ($mode eq "reindex") {
        $mode_text = "reindexed";
    }

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);


    # record files marked for deletion in arcinfo
    foreach my $file (@$deleted_files) {
        # use 'archiveinf-src' info database file to look up all the OIDs
        # that this file is used in (note in most cases, it's just one OID)

        my $src_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
        my $oids = $src_rec->{'oid'};
        # NOTE(review): assigned but never used in this sub
        my $file_record_deleted = 0;

        # delete the src record
        my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
        &dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
        &dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);


        foreach my $oid (@$oids) {

            # find the source doc (the primary file that becomes this oid)
            my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
            my $doc_source_file = $doc_rec->{'src-file'}->[0];
            if (!&util::filename_is_absolute($doc_source_file)) {
                $doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
            }

            if ($doc_source_file ne $file) {
                # its an associated or metadata file

                # mark source doc for reimport as one of its assoc files has changed or deleted
                $block_hash->{'reindex_files'}->{$doc_source_file} = 1;

            }
            # only touch OIDs not already marked "D" (avoids rewriting
            # the same record repeatedly)
            my $curr_status = $archive_info->get_status_info($oid);
            if (defined($curr_status) && (($curr_status ne "D"))) {
                if ($verbosity>1) {
                    print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
                }
                # mark oid for deletion (it will be deleted or reimported)
                $archive_info->set_status_info($oid,"D");
                # rewrite the raw db record with its index-status line
                # flipped to "D", then store it back
                my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
                $val =~ s/^<index-status>(.*)$/<index-status>D/m;

                my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
                my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");

                &dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
                &dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
            }
        }

    }

    # now go through and check that we haven't marked any primary
    # files for reindex (because their associated files have
    # changed/deleted) when they have been deleted themselves. only in
    # delete mode.

    if ($mode eq "delete") {
        foreach my $file (@$deleted_files) {
            if (defined $block_hash->{'reindex_files'}->{$file}) {
                delete $block_hash->{'reindex_files'}->{$file};
            }
        }
    }


}
1216
# Recursively append every non-directory entry underneath $dirname to
# the array referenced by $list, skipping "." / ".." and ".svn"
# directories.
#
# Returns -1 (after printing a warning) if $dirname cannot be read;
# otherwise the return value is not meaningful.
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # Lexical dirhandle instead of the shared bareword DIR: safer under
    # recursion and automatically closed when it goes out of scope
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
        print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
        return -1; # error in processing
    }
    my @entries = readdir ($dir_handle);
    closedir ($dir_handle);

    foreach my $subfile (@entries) {
        next if ($subfile =~ m/^\.\.?$/);   # skip . and ..
        next if ($subfile =~ /^\.svn$/);    # skip Subversion metadata dirs
        my $full_file = &util::filename_cat($dirname, $subfile);
        if (-d $full_file) {
            # recurse into subdirectory
            &add_dir_contents_to_list($full_file, $list);
        } else {
            push (@$list, $full_file);
        }
    }

}
1245
1246
12471;
Note: See TracBrowser for help on using the browser.