root/main/trunk/greenstone2/perllib/inexport.pm @ 24829

Revision 24829, 38.3 KB (checked in by ak19, 8 years ago)

Changes to bat files and perl code to deal with brackets in (Windows) filepath. Also checked winmake.bat files to see if changes were needed there. These changes go together with the commits 24826 to 24828 for gems.bat, and commit 24820 on makegs2.bat.

  • Property svn:executable set to *
Line 
1###########################################################################
2#
3# inexport.pm -- useful class to support import.pl and export.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
package inexport;

use strict;

no strict 'refs'; # allow filehandles to be variables and vice versa
no strict 'subs'; # allow barewords (eg STDERR) as function arguments

use arcinfo;
use colcfg;
use dbutil;
use doc;
use plugin;
use plugout;
use manifest;
use inexport;  # NOTE(review): module uses itself; a harmless no-op, but looks unintentional
use util;
use scriptutil;
use FileHandle;
use gsprintf 'gsprintf';
use printusage;
use parse2;

use File::Basename;
# Constructor used by import.pl / export.pl style scripts.
#
# Parameters:
#   $mode                - "import" or "export"; stored in $self->{'mode'}
#   $argv                - ref to the script's argument list; consumed by
#                          parse2::parse, the single leftover arg is the
#                          collection name
#   $options             - option spec hash (its {'args'} entry is the
#                          argument table passed to the parser)
#   $opt_listall_options - option spec printed when -listall was given
#
# Dies (after printing usage) on parse failure, -listall, -h, or a wrong
# number of leftover arguments. Returns a blessed inexport object.
sub new
{
    my $class = shift (@_);
    my ($mode,$argv,$options,$opt_listall_options) = @_;

    my $self = { 'xml' => 0, 'mode' => $mode };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = parse2::parse($argv,$arguments,$self,"allow_extra_options");
    # Parse returns -1 if something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    my $language = $self->{'language'};
    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($self->{'listall'}) {
        if ($self->{'xml'}) {
            &PrintUsage::print_xml_usage($opt_listall_options);
        }
        else
        {
            &PrintUsage::print_txt_usage($opt_listall_options,"{export.params}");
        }
        die "\n";
    }

    if ($self->{'xml'}) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return bless $self, $class;
    }

    if ($self->{'gli'}) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also
    if ($intArgLeftinAfterParsing != 1 || (@$argv && $argv->[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    $self->{'close_out'} = 0;
    my $out = $self->{'out'};
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # 3-arg open: a filename containing '>' or '|' must not be
        # misinterpreted as an open mode (2-arg open would do that).
        # OUT stays a bareword handle because generate_statistics()
        # closes it by name.
        open (OUT, '>', $out) ||
            (&gsprintf(STDERR, "{common.cannot_open_output_file}: $!\n", $out) && die);
        $out = 'inexport::OUT';
        $self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift @$argv;

    # parallel import helper is only needed when more than one job was asked for
    if ((defined $self->{'jobs'}) && ($self->{'jobs'}>1)) {
        require ParallelInexport;
    }

    return bless $self, $class;
}
127
# Simplified constructor for use from CGI scripts: no command-line
# parsing, output goes to STDERR, and no fail log is configured.
# $gsdl_cgi (if given) supplies the collect directory for $opt_site;
# otherwise the standard GS2 layout under GSDLHOME is assumed.
sub newCGI
{
    my $class = shift (@_);
    my ($mode, $collect, $gsdl_cgi, $opt_site) = @_;

    my $self = bless { 'xml' => 0, 'mode' => $mode }, $class;

    $self->{'out'} = STDERR;

    if (defined $gsdl_cgi) {
        # the CGI wrapper knows where the site's collect dir lives
        $self->{'site'}       = $opt_site;
        $self->{'collectdir'} = $gsdl_cgi->get_collection_dir($opt_site);
    }
    else {
        # fall back to the default GSDLHOME/collect location
        $self->{'site'}       = "";
        $self->{'collectdir'} = &util::filename_cat($ENV{'GSDLHOME'},"collect");
    }

    $self->{'faillog'}    = "";
    $self->{'collection'} = $collect;

    return $self;
}
# Accessor: the name of the collection this object was constructed for.
sub get_collection
{
    my ($self) = @_;

    return $self->{'collection'};
}
159
160
# Resolve the collection name, open the fail log, and read the
# collection configuration file.
#
# Parameters:
#   $collection - collection name as given on the command line
#   $options    - option spec, used only to print usage on failure
#
# Returns ($config_filename, $collectcfg). Dies with usage text if the
# collection cannot be resolved, or if the fail log cannot be opened.
# Side effects: prepends the collection's perllib to @INC, and stores
# 'faillog' (handle name) and 'faillogname' (path) on $self.
sub read_collection_cfg
{
    my $self = shift @_;
    my ($collection,$options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site       = $self->{'site'};
    my $out        = $self->{'out'};

    # use_collection() returns "" on failure; it also sets
    # $ENV{'GSDLCOLLECTDIR'}, which everything below relies on
    if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{import.params}");
        die "\n";
    }

    # add collection's perllib dir into include path in
    # case we have collection specific modules
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # check that we can open the faillog
    my $faillog = $self->{'faillog'};
    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # 3-arg open: the path may contain characters ('>', '|', brackets)
    # that 2-arg open would misinterpret as a mode. FAILLOG stays a
    # bareword handle because generate_statistics() closes it by name.
    open (FAILLOG, '>', $faillog) ||
        (&gsprintf(STDERR, "{import.cannot_open_fail_log}\n", $faillog) && die);

    my $faillogname = $faillog;
    $faillog = 'inexport::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    $self->{'faillogname'} = $faillogname;

    # Read in the collection configuration file.
    my ($config_filename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    my $collectcfg = &colcfg::read_collection_cfg ($config_filename, $gs_mode);

    return ($config_filename,$collectcfg);
}
200
# Merge command-line options (already stored on $self by the parser)
# with settings from the collection configuration, applying defaults
# where neither was given. Precedence throughout: command line first,
# then collect.cfg, then the hard-coded default.
# Side effects: normalises and stores importdir/archivedir on $self,
# and dies if the import directory does not exist.
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity  = $self->{'verbosity'};
    my $debug      = $self->{'debug'};
    my $importdir  = $self->{'importdir'};
    my $archivedir = $self->{'archivedir'} || $self->{'exportdir'} || "";
    my $out        = $self->{'out'};

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
      $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    if ($collectcfg->{'infodbtype'} eq "gdbm-txtgz") {
    # we can't use the text version for archives dbs.
    $collectcfg->{'infodbtype'} = "gdbm";
    }
    # only fall back to collect.cfg when the option wasn't on the command line
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
    $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
    $archivedir = $collectcfg->{'archivedir'};
    }
    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    if (!-e $importdir) {
    &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
    die "\n";
    }
    $self->{'importdir'} = $importdir;

    # default output dir depends on whether we are importing or exporting
    if ($archivedir eq "") {
    if ($inexport_mode eq "import") {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    }
    elsif ($inexport_mode eq "export") {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    else {
        print STDERR "Warning: Unrecognized import/export mode '$inexport_mode'\n";
        print STDERR "         Defaulting to 'archives' for file output\n";
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    }
    }

    # normalise slashes and strip any trailing /
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $self->{'archivedir'} = $archivedir;

    if ($verbosity !~ /\d+/) {
    if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
        $verbosity = $collectcfg->{'verbosity'};
    } else {
        $verbosity = 2; # the default
    }
    }
    $self->{'verbosity'} = $verbosity;

    if (defined $collectcfg->{'manifest'} && $self->{'manifest'} eq "") {
    $self->{'manifest'} = $collectcfg->{'manifest'};
    }

    # collect.cfg can turn gzip on, but never off
    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
    if ($collectcfg->{'gzip'} =~ /^true$/i) {
        $self->{'gzip'} = 1;
    }
    }

    if ($self->{'maxdocs'} !~ /\-?\d+/) {
    if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
        $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
    } else {
        $self->{'maxdocs'} = -1; # the default (no limit)
    }
    }

    # groupsize of 1 is treated as "not set", so collect.cfg may override it
    if ((defined $self->{'groupsize'}) && ($self->{'groupsize'} == 1)) {
    if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
        $self->{'groupsize'} = $collectcfg->{'groupsize'};
    }
    }

    # OIDtype must be one of the recognised schemes; otherwise fall back
    if (!defined $self->{'OIDtype'}
    || ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) {
    if (defined $collectcfg->{'OIDtype'}
        && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
        $self->{'OIDtype'} = $collectcfg->{'OIDtype'};
    } else {
        $self->{'OIDtype'} = "hash"; # the default
    }
    }

    if ((!defined $self->{'OIDmetadata'}) || ($self->{'OIDmetadata'} eq "")) {
    if (defined $collectcfg->{'OIDmetadata'}) {
        $self->{'OIDmetadata'} = $collectcfg->{'OIDmetadata'};
    } else {
        $self->{'OIDmetadata'} = "dc.Identifier"; # the default
    }
    }

    my $sortmeta = $self->{'sortmeta'};
    if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
    $sortmeta = $collectcfg->{'sortmeta'};
    }
    # sortmeta cannot be used with group size
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $self->{'groupsize'} > 1) {
    &gsprintf($out, "{import.cannot_sort}\n\n");
    $sortmeta = undef;
    }
    $self->{'sortmeta'} = $sortmeta;

    if (defined $collectcfg->{'removeprefix'} && $self->{'removeprefix'} eq "") {
    $self->{'removeprefix'} = $collectcfg->{'removeprefix'};
    }

    if (defined $collectcfg->{'removesuffix'} && $self->{'removesuffix'} eq "") {
    $self->{'removesuffix'} = $collectcfg->{'removesuffix'};
    }
    # debug/gli can be switched on (but not off) from collect.cfg
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
    $self->{'debug'} = 1;
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
    $self->{'gli'} = 1;
    }
    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # check keepold and removeold
    my $checkdir = ($inexport_mode eq "import") ? "archives" : "export";

    my ($removeold, $keepold, $incremental, $incremental_mode)
    = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
                           $self->{'incremental'}, $checkdir,
                           $collectcfg);

    $self->{'removeold'}        = $removeold;
    $self->{'keepold'}          = $keepold;
    $self->{'incremental'}      = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;
}
349
# Run the import (or export) over the import directory.
#
# Steps: load plugins and choose/configure a plugout; optionally wipe the
# old archives and tmp dirs; load the archive-info databases; work out
# which files are new / reindexable / deleted — either from an explicit
# manifest file or by diffing against the previous import — then feed
# everything through plugin::read into the archives/export directory.
#
# Parameters:
#   $config_filename - collection config path (used for the Fedora
#                      collection-level doc object)
#   $collectcfg      - parsed collection configuration hash
#
# Returns the loaded $pluginfo so generate_statistics() can report on it.
# Dies if no plugins could be loaded.
sub process_files
{
    my $self = shift @_;
    my ($config_filename,$collectcfg) = @_;

    my $inexport_mode = $self->{'mode'};

    my $verbosity   = $self->{'verbosity'};
    my $debug       = $self->{'debug'};

    my $importdir   = $self->{'importdir'};
    my $archivedir  = $self->{'archivedir'} || $self->{'exportdir'};

    my $incremental = $self->{'incremental'};
    my $incremental_mode = $self->{'incremental_mode'};

    my $removeold   = $self->{'removeold'};
    my $keepold     = $self->{'keepold'};

    my $saveas      = $self->{'saveas'};
    my $OIDtype     = $self->{'OIDtype'};
    my $OIDmetadata = $self->{'OIDmetadata'};

    my $out         = $self->{'out'};
    my $faillog     = $self->{'faillog'};

    my $maxdocs     = $self->{'maxdocs'};
    my $gzip        = $self->{'gzip'};
    my $groupsize   = $self->{'groupsize'};
    my $sortmeta    = $self->{'sortmeta'};

    my $removeprefix = $self->{'removeprefix'};
    my $removesuffix = $self->{'removesuffix'};

    my $gli          = $self->{'gli'};

    my $jobs         = $self->{'jobs'};
    my $epoch        = $self->{'epoch'};

    # related to export
    my $xsltfile         = $self->{'xsltfile'};
    my $group_marc       = $self->{'group_marc'};
    my $mapping_file     = $self->{'mapping_file'};
    my $xslt_mets        = $self->{'xslt_mets'};
    my $xslt_txt         = $self->{'xslt_txt'};
    my $fedora_namespace = $self->{'fedora_namespace'};
    my $metadata_prefix  = $self->{'metadata_prefix'};

    if ($inexport_mode eq "import") {
        print STDERR "<Import>\n" if $gli;
    }
    else {
        print STDERR "<export>\n" if $gli;
    }

    # a manifest file, if given, explicitly lists files to index/reindex/delete
    my $manifest_lookup = new manifest($collectcfg->{'infodbtype'},$archivedir);
    if ($self->{'manifest'} ne "") {
        my $manifest_filename = $self->{'manifest'};

        if (!&util::filename_is_absolute($manifest_filename)) {
            $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
        }

        # normalise slashes and strip any trailing /
        $self->{'manifest'} =~ s/[\\\/]+/\//g;
        $self->{'manifest'} =~ s/\/$//;

        $manifest_lookup->parse($manifest_filename);
    }

    my $manifest = $self->{'manifest'};

    # load all the plugins
    my $plugins = [];
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    my $plugin_incr_mode = $incremental_mode;
    if ($manifest ne "") {
        # if we have a manifest file, then we pretend we are fully incremental for plugins
        $plugin_incr_mode = "all";
    }
    # some global options for the plugins
    my @global_opts = ();

    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts, $plugin_incr_mode);
    if (scalar(@$pluginfo) == 0) {
        &gsprintf($out, "{import.no_plugins_loaded}\n");
        die "\n";
    }

    # remove the old contents of the archives directory (and tmp
    # directory) if needed
    if ($removeold) {
        if (-e $archivedir) {
            &gsprintf($out, "{import.removing_archives}\n");
            &util::rm_r ($archivedir);
        }
        my $tmpdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "tmp");
        $tmpdir =~ s/[\\\/]+/\//g;
        $tmpdir =~ s/\/$//;
        if (-e $tmpdir) {
            &gsprintf($out, "{import.removing_tmpdir}\n");
            &util::rm_r ($tmpdir);
        }
    }

    # create the archives dir if needed
    &util::mk_all_dir($archivedir);

    # read the archive information file

    # BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files (won't do anything for other infodbtypes)
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-doc"));
    &util::rename_ldb_or_bdb_file(&util::filename_cat($archivedir, "archiveinf-src"));

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir);
    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);

    my $archive_info = new arcinfo ($collectcfg->{'infodbtype'});
    $archive_info->load_info ($arcinfo_doc_filename);

    if ($manifest eq "") {
        # Load in list of files in import folder from last import (if present)
        $archive_info->load_prev_import_filelist ($arcinfo_src_filename);
    }

    #### Use Plugout ####
    my $plugout;

    if ($inexport_mode eq "import") {
        if (defined $collectcfg->{'plugout'}) {
            # If a plugout was specified in the collect.cfg file, assume it is sensible
            # We can't check the name because it could be anything, if it is a custom plugout
            $plugout = $collectcfg->{'plugout'};
        }
        else {
            if ($saveas !~ /^(GreenstoneXML|GreenstoneMETS)$/) {
                push @$plugout,"GreenstoneXMLPlugout";
            }
            else {
                push @$plugout,$saveas."Plugout";
            }
        }
    }
    else {
        if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'} =~ /^(.*METS|DSpace|MARCXML)Plugout/) {
            $plugout = $collectcfg->{'plugout'};
        }
        else {
            if ($saveas !~ /^(GreenstoneMETS|FedoraMETS|DSpace|MARCXML)$/) {
                push @$plugout,"GreenstoneMETSPlugout";
            }
            else {
                push @$plugout,$saveas."Plugout";
            }
        }
    }

    my $plugout_name = $plugout->[0];

    # remaining entries of @$plugout are command-line style options for it
    push @$plugout,("-output_info",$archive_info)  if (defined $archive_info);
    push @$plugout,("-verbosity",$verbosity)       if (defined $verbosity);
    push @$plugout,("-debug")                      if ($debug);
    push @$plugout,("-group_size",$groupsize)      if (defined $groupsize);
    push @$plugout,("-gzip_output")                if ($gzip);
    push @$plugout,("-output_handle",$out)         if (defined $out);

    push @$plugout,("-xslt_file",$xsltfile)        if (defined $xsltfile && $xsltfile ne "");

    if ($plugout_name =~ m/^MARCXMLPlugout$/) {
        push @$plugout,("-group")                      if ($group_marc);
        push @$plugout,("-mapping_file",$mapping_file) if (defined $mapping_file && $mapping_file ne "");
    }
    if ($plugout_name =~ m/^.*METSPlugout$/) {
        push @$plugout,("-xslt_mets",$xslt_mets)       if (defined $xslt_mets && $xslt_mets ne "");
        push @$plugout,("-xslt_txt",$xslt_txt)         if (defined $xslt_txt && $xslt_txt ne "");
    }

    if ($plugout_name eq "FedoraMETSPlugout") {
        push @$plugout,("-fedora_namespace",$fedora_namespace) if (defined $fedora_namespace && $fedora_namespace ne "");
    }

    if ($plugout_name eq "DSpacePlugout") {
        push @$plugout,("-metadata_prefix",$metadata_prefix) if (defined $metadata_prefix && $metadata_prefix ne "");
    }

    my $processor = &plugout::load_plugout($plugout);
    $processor->setoutputdir ($archivedir);
    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
    $processor->set_OIDtype ($OIDtype, $OIDmetadata);

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);

    if ($removeold) {
        # occasionally, plugins may want to do something on remove
        # old, eg pharos image indexing
        &plugin::remove_all($pluginfo, $importdir, $processor, $maxdocs, $gli);
    }

    # process the import directory
    my $block_hash = {};
    $block_hash->{'new_files'} = {};
    $block_hash->{'reindex_files'} = {};
    my $metadata = {};

    # global blocking pass may set up some metadata
    &plugin::file_block_read($pluginfo, $importdir, "", $block_hash, $metadata, $gli);

    if ($manifest ne "") {
        #
        # 1. Process delete files first
        #
        my @deleted_files = keys %{$manifest_lookup->{'delete'}};
        my @full_deleted_files = ();

        # ensure all filenames are absolute
        foreach my $df (@deleted_files) {
            my $full_df =
                (&util::filename_is_absolute($df))
                ? $df
                : &util::filename_cat($importdir,$df);

            if (-d $full_df) {
                &add_dir_contents_to_list($full_df, \@full_deleted_files);
            } else {
                push(@full_deleted_files,$full_df);
            }
        }

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_deleted_files);
        mark_docs_for_deletion($archive_info,{},
                               \@full_deleted_files,
                               $archivedir, $verbosity, "delete");

        #
        # 2. Now files for reindexing
        #
        my @reindex_files = keys %{$manifest_lookup->{'reindex'}};
        my @full_reindex_files = ();
        # ensure all filenames are absolute
        foreach my $rf (@reindex_files) {
            my $full_rf =
                (&util::filename_is_absolute($rf))
                ? $rf
                : &util::filename_cat($importdir,$rf);

            if (-d $full_rf) {
                &add_dir_contents_to_list($full_rf, \@full_reindex_files);
            } else {
                push(@full_reindex_files,$full_rf);
            }
        }

        &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@full_reindex_files);
        mark_docs_for_deletion($archive_info,{},\@full_reindex_files, $archivedir,$verbosity, "reindex");

        # And now to ensure the new version of the file processed by
        # appropriate plugin, we need to add it to block_hash reindex list
        foreach my $full_rf (@full_reindex_files) {
            $block_hash->{'reindex_files'}->{$full_rf} = 1;
        }

        #
        # 3. Now finally any new files - add to block_hash new_files list
        #
        my @new_files = keys %{$manifest_lookup->{'index'}};
        my @full_new_files = ();

        foreach my $nf (@new_files) {
            # ensure filename is absolute
            my $full_nf =
                (&util::filename_is_absolute($nf))
                ? $nf
                : &util::filename_cat($importdir,$nf);

            if (-d $full_nf) {
                &add_dir_contents_to_list($full_nf, \@full_new_files);
            } else {
                push(@full_new_files,$full_nf);
            }
        }

        my $arcinfo_src_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-src", $archivedir);
        my $arcinfodb_map = {};
        &dbutil::read_infodb_file($collectcfg->{'infodbtype'}, $arcinfo_src_filename, $arcinfodb_map);
        foreach my $f (@full_new_files) {
            # check that we haven't seen it already
            if (defined $arcinfodb_map->{$f}) {
                # TODO make better warning
                print STDERR "Warning: $f already in src archive, \n";
            } else {
                $block_hash->{'new_files'}->{$f} = 1;
            }
        }

        undef $arcinfodb_map;
    }
    else {
        # if incremental, we read through the import folder to see what's changed.

        if ($incremental || $incremental_mode eq "onlyadd") {
            prime_doc_oid_count($archivedir);

            # Can now work out which files were new, already existed, and have
            # been deleted

            new_vs_old_import_diff($archive_info,$block_hash,$importdir,
                                   $archivedir,$verbosity,$incremental_mode);

            my @new_files = sort keys %{$block_hash->{'new_files'}};
            if (scalar(@new_files) > 0) {
                print STDERR "New files and modified metadata files since last import:\n  ";
                print STDERR join("\n  ",@new_files), "\n";
            }

            if ($incremental) {
                # only look for deletions if we are truly incremental
                my @deleted_files = sort keys %{$block_hash->{'deleted_files'}};
                # Filter out any in gsdl/tmp area
                my @filtered_deleted_files = ();
                my $gsdl_tmp_area = &util::filename_cat($ENV{'GSDLHOME'}, "tmp");
                my $collect_tmp_area = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
                $gsdl_tmp_area = &util::filename_to_regex($gsdl_tmp_area);
                $collect_tmp_area = &util::filename_to_regex($collect_tmp_area);

                foreach my $df (@deleted_files) {
                    next if ($df =~ m/^$gsdl_tmp_area/);
                    next if ($df =~ m/^$collect_tmp_area/);

                    push(@filtered_deleted_files,$df);
                }

                @deleted_files = @filtered_deleted_files;

                if (scalar(@deleted_files) > 0) {
                    print STDERR "Files deleted since last import:\n  ";
                    print STDERR join("\n  ",@deleted_files), "\n";

                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@deleted_files);

                    mark_docs_for_deletion($archive_info,$block_hash,\@deleted_files, $archivedir,$verbosity, "delete");
                }

                my @reindex_files = sort keys %{$block_hash->{'reindex_files'}};

                if (scalar(@reindex_files) > 0) {
                    print STDERR "Files to reindex since last import:\n  ";
                    print STDERR join("\n  ",@reindex_files), "\n";
                    &plugin::remove_some($pluginfo, $collectcfg->{'infodbtype'}, $archivedir, \@reindex_files);
                    mark_docs_for_deletion($archive_info,$block_hash,\@reindex_files, $archivedir,$verbosity, "reindex");
                }
            }
        }
    }

    # Check for existence of the file that's to contain earliestDateStamp in archivesdir
    # Do nothing if the file already exists (file exists on incremental build).
    # If the file doesn't exist, as happens on full build, create it and write out the current datestamp into it
    # In buildcol, read the file's contents and set the earliestdateStamp in GS2's build.cfg / GS3's buildconfig.xml
    # In doc.pm have set_oaiLastModified similar to set_lastmodified, and create the doc fields
    # oailastmodified and oailastmodifieddate
    my $earliestDatestampFile = &util::filename_cat($archivedir, "earliestDatestamp");
    if (!-f $earliestDatestampFile && -d $archivedir) {
        my $current_time_in_seconds = time; # in seconds

        # 3-arg open: the archive path could contain characters that
        # 2-arg open would misinterpret as a mode
        if (open(FOUT, '>', $earliestDatestampFile)) {
            print FOUT $current_time_in_seconds;
            close(FOUT);
        }
        else {
            &gsprintf(STDERR, "{import.cannot_write_earliestdatestamp}\n", $earliestDatestampFile);
        }
    }

    # now, whichever mode we are in, we can process the entire import folder
    if ((defined $jobs) && ($jobs > 1))
    {
        # if jobs are set to >1, run in parallel using MPI helper
        # [hs, 1 july 2010]
        &ParallelInexport::farm_out_processes($jobs, $epoch, $importdir, $block_hash,
                                              $self->{'collection'}, $self->{'site'});
    }
    else
    {
        &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    }

    if ($saveas eq "FedoraMETS") {
        # create collection "doc obj" for Fedora that contains
        # collection-level metadata

        my $doc_obj = new doc($config_filename,"nonindexed_doc","none");
        $doc_obj->set_OID("collection");

        my $col_meta = $collectcfg->{'collectionmeta'};

        if (defined $col_meta) {
            store_collectionmeta($col_meta,"collectionname",$doc_obj); # in GS3 this is a collection's name
            store_collectionmeta($col_meta,"collectionextra",$doc_obj); # in GS3 this is a collection's description
        }
        $processor->process($doc_obj);
    }

    &plugin::end($pluginfo, $processor);

    &plugin::deinit($pluginfo, $processor);

    # Store the value of OIDCount (used in doc.pm) so it can be
    # restored correctly to this value on an incremental build
    store_doc_oid_count($archivedir);

    # write out the archive information file
    $processor->close_file_output() if (defined $groupsize) && ($groupsize > 1);
    $processor->close_group_output() if $processor->is_group();

    # for backwards compatibility with archives.inf file
    if ($arcinfo_doc_filename =~ m/(contents)|(\.inf)$/) {
        $archive_info->save_info($arcinfo_doc_filename);
    }
    else {
        $archive_info->save_revinfo_db($arcinfo_src_filename);
    }

    return $pluginfo;
}
788
789
# Write the end-of-run statistics banner and per-plugin stats, then close
# the output and fail-log handles opened by new()/read_collection_cfg().
#
# Parameters:
#   $pluginfo - the loaded plugin list returned by process_files()
#
# Statistics go to $self->{'statsfile'} if it can be opened, otherwise
# they fall back to STDERR.
sub generate_statistics
{
    my $self = shift @_;
    my ($pluginfo) = @_;

    my $inexport_mode = $self->{'mode'};

    my $statsfile   = $self->{'statsfile'};
    my $out         = $self->{'out'};
    my $faillogname = $self->{'faillogname'};
    my $gli         = $self->{'gli'};
    my $jobs        = $self->{'jobs'};

    # write out import stats

    if ((!defined $jobs) || ($jobs == 1))
    {
        # statistics are suppressed when running multiple parallel jobs
        # [hs, 1 july 2010]

        my $close_stats = 0;
        if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
            # 3-arg open: the stats path may contain characters that
            # 2-arg open would misinterpret as a mode
            if (open (STATS, '>', $statsfile)) {
                $statsfile = 'inexport::STATS';
                $close_stats = 1;
            } else {
                # fall back to STDERR if the stats file can't be opened
                &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
                &gsprintf($out, "{import.stats_backup}\n");
                $statsfile = 'STDERR';
            }
        }

        &gsprintf($out, "\n");
        &gsprintf($out, "*********************************************\n");
        &gsprintf($out, "{$inexport_mode.complete}\n");
        &gsprintf($out, "*********************************************\n");

        &plugin::write_stats($pluginfo, $statsfile, $faillogname, $gli);
        if ($close_stats) {
            close STATS;
        }
    }

    # close the handles opened in new() and read_collection_cfg()
    close OUT if $self->{'close_out'};
    close FAILLOG;
}
836
837
# Copy one collection-level metadata field (eg "collectionname") from the
# collect.cfg metadata structure onto a document object. A key of the
# form "[l=xx]" carries a language qualifier, which becomes an "^xx"
# suffix on the metadata label. The English (or unqualified) collection
# name is additionally stored as dc.Title.
sub store_collectionmeta
{
    my ($collectionmeta,$field,$doc_obj) = @_;

    my $section    = $doc_obj->get_top_section();
    my $field_hash = $collectionmeta->{$field};

    foreach my $lang_key (keys %$field_hash)
    {
        my $value = $field_hash->{$lang_key};

        my $md_label = "ex.$field";

        # "[l=en]" style keys name the language the value is written in
        if ($lang_key =~ m/^\[l=(.*?)\]$/)
        {
            $md_label .= "^$1";
        }

        $doc_obj->add_utf8_metadata($section, $md_label, $value);

        # see collConfigxml.pm: GS2's "collectionextra" is called "description" in GS3,
        # while "collectionname" in GS2 is called "name" in GS3.
        # Variable $nameMap variable in collConfigxml.pm maps between GS2 and GS3
        if (($md_label eq "ex.collectionname^en") || ($md_label eq "ex.collectionname"))
        {
            $doc_obj->add_utf8_metadata($section, "dc.Title", $value);
        }
    }
}
875
876
# oid_count_file
#
# Return the full path of the "OIDcount" marker file kept inside the
# given archives directory (used to persist doc.pm's OID counter).
sub oid_count_file {
    my ($archivedir) = @_;

    my $oid_count_path = &util::filename_cat($archivedir, "OIDcount");
    return $oid_count_path;
}
881
882
# prime_doc_oid_count
#
# Initialise doc.pm's OID counter ($doc::OIDcount) from the value saved in
# the archives directory's "OIDcount" file by a previous run, so newly
# assigned OIDs continue where the last import/export left off.  Does
# nothing if the file does not exist; prints a translated warning to
# STDERR if it exists but cannot be read.
sub prime_doc_oid_count
{
    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # No saved counter yet (e.g. first ever import) - nothing to prime.
    return unless -e $oid_count_filename;

    # Three-argument open with a lexical filehandle: immune to mode
    # injection and safe for paths containing special characters
    # (e.g. brackets in Windows file paths).
    if (open(my $oid_in, "<", $oid_count_filename)) {
	my $OIDcount = <$oid_in>;
	close($oid_in);

	# Guard against an empty file: chomp on undef would warn.
	if (defined $OIDcount) {
	    chomp $OIDcount;
	    $doc::OIDcount = $OIDcount;
	}
    }
    else {
	&gsprintf(STDERR, "{import.cannot_read_OIDcount}\n", $oid_count_filename);
    }
}
902
sub store_doc_oid_count
{
    # Use the file "OIDcount" in the archives directory to record
    # what value doc.pm got up to, so the next run can resume from it.
    # Prints a translated warning to STDERR if the file cannot be
    # written (including failures only detected when the buffered
    # handle is closed).

    my ($archivedir) = @_;
    my $oid_count_filename = &oid_count_file($archivedir);

    # Three-argument open with a lexical filehandle: immune to mode
    # injection and safe for paths containing special characters.
    if (open(my $oid_out, ">", $oid_count_filename)) {
	print {$oid_out} $doc::OIDcount, "\n";

	# Buffered write errors only surface at close, so check it too.
	if (!close($oid_out)) {
	    &gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
	}
    }
    else {
	&gsprintf(STDERR, "{import.cannot_write_OIDcount}\n", $oid_count_filename);
    }
}
921
922
923
# new_vs_old_import_diff
#
# Compare the current import area against the file list recorded by the
# previous import and partition $block_hash->{'all_files'} (consuming it)
# into the following keys of $block_hash, all modified in place:
#
#   'new_files'                      - not present in the previous import
#   'existing_files'                 - seen before, unchanged ("all" mode only)
#   'new_or_modified_metadata_files' - metadata.xml-style files that are new,
#                                      or modified since the archive info
#                                      database was last written
#   'reindex_files'                  - existing files that must be re-imported
#                                      (modified, or located under a directory
#                                      containing new/modified metadata)
#   'deleted_files'                  - in the previous import but no longer
#                                      present on disk
#
# Parameters:
#   $archive_info     - arcinfo object; supplies 'infodbtype' and
#                       'prev_import_filelist'
#   $block_hash       - hash ref of file lists (side-effected, see above)
#   $importdir        - import directory, used to absolutise relative paths
#   $archivedir       - archives directory holding the archiveinf-doc database
#   $verbosity        - >= 2 enables extra diagnostics on STDERR
#   $incremental_mode - "all" keeps unchanged files as 'existing_files';
#                       otherwise (e.g. "onlyadd") they are warned about and
#                       treated as 'new_files'
#
# Returns nothing useful; works entirely by side effect on $block_hash.
sub new_vs_old_import_diff
{
    my ($archive_info,$block_hash,$importdir,$archivedir,$verbosity,$incremental_mode) = @_;

    # Get the infodbtype value for this collection from the arcinfo object
    my $infodbtype = $archive_info->{'infodbtype'};

    # in this method, we want to know if metadata files are modified or not.
    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);

    # -M gives age in days since last modification; a SMALLER value than
    # this timestamp means "modified more recently than the archive info db".
    my $archiveinf_timestamp = -M $arcinfo_doc_filename;

    # First convert all files to absolute form
    # This is to support the situation where the import folder is not
    # the default

    # Maps absolute path -> path as recorded in the previous import list.
    my $prev_all_files = $archive_info->{'prev_import_filelist'};
    my $full_prev_all_files = {};

    foreach my $prev_file (keys %$prev_all_files) {

	if (!&util::filename_is_absolute($prev_file)) {
	    my $full_prev_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$prev_file);
	    $full_prev_all_files->{$full_prev_file} = $prev_file;
	}
	else {
	    $full_prev_all_files->{$prev_file} = $prev_file;
	}
    }


    # Figure out which are the new files, existing files and so
    # by implication the files from the previous import that are not
    # there any more => mark them for deletion
    foreach my $curr_file (keys %{$block_hash->{'all_files'}}) {

	my $full_curr_file = $curr_file;

	# entry in 'all_files' is moved to either 'existing_files',
	# 'deleted_files', 'new_files', or 'new_or_modified_metadata_files'

	if (!&util::filename_is_absolute($curr_file)) {
	    # add in import dir to make absolute
	    $full_curr_file = &util::filename_cat($importdir,$curr_file);
	}

	# figure out if new file or not
	if (defined $full_prev_all_files->{$full_curr_file}) {
	    # delete it so that only files that need deleting are left
	    delete $full_prev_all_files->{$full_curr_file};

	    # had it before. is it a metadata file?
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {

		# is it modified??
		if (-M $full_curr_file < $archiveinf_timestamp) {
		    print STDERR "*** Detected a *modified metadata* file: $full_curr_file\n" if $verbosity >= 2;
		    # its newer than last build
		    $block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
		}
	    }
	    else {
		if ($incremental_mode eq "all") {

		    # had it before
		    $block_hash->{'existing_files'}->{$full_curr_file} = 1;

		}
		else {
		    # Warning in "onlyadd" mode, but had it before!
		    print STDERR "Warning: File $full_curr_file previously imported.\n";
		    print STDERR "         Treating as new file\n";

		    $block_hash->{'new_files'}->{$full_curr_file} = 1;

		}
	    }
	}
	else {
	    if ($block_hash->{'metadata_files'}->{$full_curr_file}) {
		# the new file is the special sort of file greenstone uses
		# to attach metadata to src documents
		# i.e metadata.xml
		# (but note, the filename used is not constrained in
		# Greenstone to always be this)

		print STDERR "*** Detected *new* metadata file: $full_curr_file\n" if $verbosity >= 2;
		$block_hash->{'new_or_modified_metadata_files'}->{$full_curr_file} = 1;
	    }
	    else {
		$block_hash->{'new_files'}->{$full_curr_file} = 1;
	    }
	}


	delete $block_hash->{'all_files'}->{$curr_file};
    }




    # Deal with complication of new or modified metadata files by forcing
    # everything from this point down in the file hierarchy to
    # be freshly imported. 
    #
    # This may mean files that have not changed are reindexed, but does
    # guarantee by the end of processing all new metadata is correctly
    # associated with the relevant document(s).

    foreach my $new_mdf (keys %{$block_hash->{'new_or_modified_metadata_files'}}) {
	my ($fileroot,$situated_dir,$ext) = fileparse($new_mdf, "\\.[^\\.]+\$");

	$situated_dir =~ s/[\\\/]+$//; # remove tailing slashes
	$situated_dir = &util::filename_to_regex($situated_dir); # need to escape windows slash \ and brackets in regular expression

	# Go through existing_files, and mark anything that is contained
	# within 'situated_dir' to be reindexed (in case some of the metadata
	# attaches to one of these files)

	# NOTE(review): populated below but never read afterwards within
	# this loop - appears to be leftover scaffolding.
	my $reindex_files = [];

	foreach my $existing_f (keys %{$block_hash->{'existing_files'}}) {

	    if ($existing_f =~ m/^$situated_dir/) {

		print STDERR "**** Existing file $existing_f\nis located within\n$situated_dir\n";

		push(@$reindex_files,$existing_f);
		$block_hash->{'reindex_files'}->{$existing_f} = 1;
		delete $block_hash->{'existing_files'}->{$existing_f};

	    }
	}

	# metadata file needs to be in new_files list so parsed by MetadataXMLPlug
	# (or equivalent)
	$block_hash->{'new_files'}->{$new_mdf} = 1;

    }

    # go through remaining existing files and work out what has changed and needs to be reindexed.
    my @existing_files = sort keys %{$block_hash->{'existing_files'}};

    # NOTE(review): also populated but never read after this loop.
    my $reindex_files = [];

    foreach my $existing_filename (@existing_files) {
	if (-M $existing_filename < $archiveinf_timestamp) {
	    # file is newer than last build

	    my $existing_file = $existing_filename;
	    #my $collectdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'});

	    #my $collectdir_resafe = &util::filename_to_regex($collectdir);
	    #$existing_file =~ s/^$collectdir_resafe(\\|\/)?//;

	    print STDERR "**** Reindexing existing file: $existing_file\n";

	    push(@$reindex_files,$existing_file);
	    $block_hash->{'reindex_files'}->{$existing_filename} = 1;
	}

    }


    # By this point full_prev_all_files contains the files
    # mentioned in archiveinf-src.db but are not in the 'import'
    # folder (or whatever was specified through -importdir ...)

    # This list can contain files that were created in the 'tmp' or
    # 'cache' areas (such as screen-size and thumbnail images).
    #
    # In building the final list of files to delete, we test to see if
    # it exists on the filesystem and if it does (unusual for a "normal"
    # file in import, but possible in the case of 'tmp' files),
    # supress it from going into the final list

    my $collectdir = $ENV{'GSDLCOLLECTDIR'};

    my @deleted_files = values %$full_prev_all_files;
    # NOTE(review): map in void context, used purely for side effects on
    # $block_hash - a plain foreach loop would be the idiomatic form.
    map { my $curr_file = $_;
	  my $full_curr_file = $curr_file;

	  if (!&util::filename_is_absolute($curr_file)) {
	      # add in import dir to make absolute

	      $full_curr_file = &util::filename_cat($collectdir,$curr_file);
	  }


	  if (!-e $full_curr_file) {
	      $block_hash->{'deleted_files'}->{$curr_file} = 1;
	  }
      } @deleted_files;



}
1121
1122
1123# this is used to delete "deleted" docs, and to remove old versions of "changed" docs
1124# $mode is 'delete' or 'reindex'
1125sub mark_docs_for_deletion
1126{
1127    my ($archive_info,$block_hash,$deleted_files,$archivedir,$verbosity,$mode) = @_;
1128
1129    my $mode_text = "deleted from index";
1130    if ($mode eq "reindex") {
1131    $mode_text = "reindexed";
1132    }
1133
1134    # Get the infodbtype value for this collection from the arcinfo object
1135    my $infodbtype = $archive_info->{'infodbtype'};
1136
1137    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
1138    my $arcinfo_src_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-src", $archivedir);
1139
1140
1141    # record files marked for deletion in arcinfo
1142    foreach my $file (@$deleted_files) {
1143    # use 'archiveinf-src' info database file to look up all the OIDs
1144    # that this file is used in (note in most cases, it's just one OID)
1145   
1146    my $src_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_src_filename, $file);
1147    my $oids = $src_rec->{'oid'};
1148    my $file_record_deleted = 0;
1149
1150    # delete the src record
1151    my $src_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_src_filename, "append");
1152    &dbutil::delete_infodb_entry($infodbtype, $src_infodb_file_handle, $file);
1153    &dbutil::close_infodb_write_handle($infodbtype, $src_infodb_file_handle);
1154
1155
1156    foreach my $oid (@$oids) {
1157
1158        # find the source doc (the primary file that becomes this oid)
1159        my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
1160        my $doc_source_file = $doc_rec->{'src-file'}->[0];
1161        if (!&util::filename_is_absolute($doc_source_file)) {
1162        $doc_source_file = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$doc_source_file);
1163        }
1164
1165        if ($doc_source_file ne $file) {
1166        # its an associated or metadata file
1167       
1168        # mark source doc for reimport as one of its assoc files has changed or deleted
1169        $block_hash->{'reindex_files'}->{$doc_source_file} = 1;
1170       
1171        }
1172        my $curr_status = $archive_info->get_status_info($oid);
1173        if (defined($curr_status) && (($curr_status ne "D"))) {
1174        if ($verbosity>1) {
1175            print STDERR "$oid ($doc_source_file) marked to be $mode_text on next buildcol.pl\n";
1176        }
1177        # mark oid for deletion (it will be deleted or reimported)
1178        $archive_info->set_status_info($oid,"D");
1179        my $val = &dbutil::read_infodb_rawentry($infodbtype, $arcinfo_doc_filename, $oid);
1180        $val =~ s/^<index-status>(.*)$/<index-status>D/m;
1181
1182        my $val_rec = &dbutil::convert_infodb_string_to_hash($val);
1183        my $doc_infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $arcinfo_doc_filename, "append");
1184
1185        &dbutil::write_infodb_entry($infodbtype, $doc_infodb_file_handle, $oid, $val_rec);
1186        &dbutil::close_infodb_write_handle($infodbtype, $doc_infodb_file_handle);
1187        }
1188    }
1189   
1190    }
1191
1192    # now go through and check that we haven't marked any primary
1193    # files for reindex (because their associated files have
1194    # changed/deleted) when they have been deleted themselves. only in
1195    # delete mode.
1196
1197    if ($mode eq "delete") {
1198    foreach my $file (@$deleted_files) {
1199        if (defined $block_hash->{'reindex_files'}->{$file}) {
1200        delete $block_hash->{'reindex_files'}->{$file};
1201        }
1202    }
1203    }
1204
1205
1206}
1207
# add_dir_contents_to_list
#
# Recursively append every file found under $dirname to the array ref
# $list.  The "." and ".." entries and ".svn" bookkeeping directories are
# skipped.  Prints a warning and returns -1 if a directory cannot be
# read; returns 0 on success.
sub add_dir_contents_to_list {

    my ($dirname, $list) = @_;

    # Lexical directory handle: cannot be clobbered by the recursive
    # calls below (unlike a shared bareword handle).
    my $dir_handle;
    if (!opendir ($dir_handle, $dirname)) {
	print STDERR "inexport: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    my @dir_entries = readdir ($dir_handle);
    closedir ($dir_handle);

    foreach my $subfile (@dir_entries) {
	next if ($subfile =~ m/^\.\.?$/);  # skip "." and ".."
	next if ($subfile =~ /^\.svn$/);   # skip subversion metadata dirs

	my $full_file = &util::filename_cat($dirname, $subfile);
	if (-d $full_file) {
	    &add_dir_contents_to_list($full_file, $list);
	} else {
	    push (@$list, $full_file);
	}
    }

    # Explicit success value, for symmetry with the -1 error return.
    return 0;
}
1236
1237
12381;
Note: See TracBrowser for help on using the browser.