#!/usr/bin/perl -w ########################################################################### # # import.pl -- # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
#
###########################################################################

# This program will import a number of files into a particular collection

package import;

BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/classify");
}

use arcinfo;
use colcfg;
use plugin;
use docprint;
use util;
use parsargv;
use FileHandle;

# Print the command line help text to STDOUT.
# The final newline is omitted when GSDLOS is "windows".
sub print_usage {
    print STDOUT "\n";
    print STDOUT "import.pl: Converts documents in collections -importdir directory into\n";
    print STDOUT " gml documents which are written to the -archivedir directory.\n\n";
    print STDOUT " usage: $0 [options] collection-name\n\n";
    print STDOUT " options:\n";
    print STDOUT " -verbosity number 0=none, 3=lots\n";
    print STDOUT " -importdir directory Where the original material lives\n";
    print STDOUT " -archivedir directory Where the converted material ends up\n";
    print STDOUT " -keepold Will not destroy the current contents of the\n";
    print STDOUT " archives directory (the default)\n";
    print STDOUT " -removeold Will remove the old contents of the archives\n";
    print STDOUT " directory -- use with care\n";
    print STDOUT " -gzip Use gzip to compress resulting gml documents\n";
    print STDOUT " (don't forget to include ZIPPlug in your plugin\n";
    print STDOUT " list when building from compressed documents)\n";
    print STDOUT " -maxdocs number Maximum number of documents to import\n";
    print STDOUT " -groupsize number Number of GML documents to group into one file\n";
    print STDOUT " -OIDtype hash|incremental The method to use when generating unique\n";
    print STDOUT " identifiers for each document. \"hash\" (the\n";
    print STDOUT " default) hashes the contents of the file and so\n";
    print STDOUT " will be the same every time the collection is\n";
    print STDOUT " imported. \"incremental\" is a simple document\n";
    print STDOUT " count and so will be significantly faster than\n";
    print STDOUT " \"hash\". It is not guaranteed to always assign\n";
    print STDOUT " the same identifier to a given document though\n";
    print STDOUT " and does not allow further documents to be added\n";
    print STDOUT " to existing gml archives\n";
    print STDOUT " -sortmeta metadata Sort documents alphabetically by metadata for\n";
    print STDOUT " building. This will be disabled if groupsize > 1\n";
    print STDOUT " -debug Print imported text to STDOUT\n";
    print STDOUT " -collectdir directory Collection directory (defaults to " .
	&util::filename_cat ($ENV{'GSDLHOME'}, "collect") . ")\n";
    print STDOUT " -out name Filename or handle to print output status to.\n";
    print STDOUT " -faillog name Filename to log import failures to.\n";
    print STDOUT " The default is /colname/etc/fail.log\n\n";
    print STDOUT " [Type \"perl -S import.pl | more\" if this help text scrolled off your screen]";
    print STDOUT "\n" unless $ENV{'GSDLOS'} =~ /^windows$/i;
}

&main();

# Return 1 if $filename can be opened for writing, 0 otherwise.
# The probe opens the file with '>' and therefore creates/truncates it,
# exactly as the fail log checks in main() have always done.
sub _can_write_file {
    my ($filename) = @_;

    local *TESTOPEN;
    open (TESTOPEN, '>', $filename) or return 0;
    # only close a handle that was actually opened (closing an unopened
    # handle raises a warning under -w)
    close (TESTOPEN);
    return 1;
}

# Entry point: parse the command line, merge in the options from the
# collection's etc/collect.cfg, load the plugins and run them over the
# import directory, saving the results through a docsave object (or
# printing them through docprint when -debug is set).
# Dies (after printing the usage text where appropriate) on bad arguments,
# an unknown collection, a missing collect.cfg or when no plugin loads.
sub main {
    my ($verbosity, $importdir, $archivedir, $keepold,
	$removeold, $gzip, $groupsize, $OIDtype, $debug,
	$maxdocs, $collection, $configfilename, $collectcfg,
	$pluginfo, $sortmeta, $archive_info_filename,
	$archive_info, $processor, $out, $faillogname, $collectdir);

    # note that no defaults are passed for most options as they're set
    # later (after we check the collect.cfg file)
    if (!parsargv::parse(\@ARGV,
			 'verbosity/\d+/', \$verbosity,
			 'importdir/.*/', \$importdir,
			 'archivedir/.*/', \$archivedir,
			 'keepold', \$keepold,
			 'removeold', \$removeold,
			 'gzip', \$gzip,
			 'groupsize/\d+/1', \$groupsize,
			 'OIDtype/^(hash|incremental)$/', \$OIDtype,
			 'sortmeta/.*/', \$sortmeta,
			 'debug', \$debug,
			 'maxdocs/^\-?\d+/', \$maxdocs,
			 'collectdir/.*/', \$collectdir,
			 'out/.*/STDERR', \$out,
			 'faillog/.*/', \$faillogname)) {
	&print_usage();
	die "\n";
    }

    # $out is used as a filehandle *name* from here on (it is also handed
    # to the plugins and to docsave), so it must name an open handle.
    my $close_out = 0;
    if ($out =~ /^(STDERR|STDOUT)$/i) {
	# The match is case-insensitive, but only the upper-case names are
	# forced into package main; "stderr" would otherwise resolve to the
	# unopened handle import::stderr.
	$out = uc ($out);
    } else {
	# three-argument open: the file name is never interpreted as a mode
	open (OUT, '>', $out) || die "Couldn't open output file $out\n";
	$out = 'import::OUT';
	$close_out = 1;
    }
    $out->autoflush(1);

    # set removeold to false if it has been defined
    $removeold = 0 if ($keepold);

    # get and check the collection name
    # Pass the first positional argument explicitly: flattening @ARGV into
    # the call let -collectdir slide into the collection slot when no
    # collection name was given (and let a second positional argument
    # replace -collectdir).
    # NOTE(review): assumes util::use_collection takes
    # ($collection, $collectdir) -- confirm against util.pm
    my $collection_arg = (scalar (@ARGV) > 0) ? $ARGV[0] : "";
    if (($collection = &util::use_collection($collection_arg, $collectdir)) eq "") {
	&print_usage();
	die "\n";
    }

    # check and/or set fail log file
    my $default_faillog = "$ENV{GSDLCOLLECTDIR}/etc/fail.log";
    if ($faillogname eq "") {
	$faillogname = $default_faillog;
    } elsif (!&_can_write_file ($faillogname)) {
	warn "fail.log - cannot write to \"$faillogname\", using default\n \"$ENV{GSDLCOLLECTDIR}/etc/fail.log\" instead.\n";
	$faillogname = $default_faillog;
    }
    # test that default is writable...
    if ($faillogname eq $default_faillog && !&_can_write_file ($faillogname)) {
	warn "warning - cannot write to \"$faillogname\".\n";
	$faillogname = "";
    }

    # treat a blank -sortmeta as "not given"; the groupsize conflict is
    # checked further down, once the collect.cfg values have been merged
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;

    # dynamically load the 'docsave' module so that a collection
    # specific docsave.pm is picked up if one is present.
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    require docsave;

    # get the list of plugins for this collection and set any options that
    # were specified in the collect.cfg (all import.pl options except
    # -collectdir, -out and -faillog may be specified in the collect.cfg (these
    # options must be known before we read the collect.cfg))
    my $plugins = [];
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
	$collectcfg = &colcfg::read_collect_cfg ($configfilename);
	if (defined $collectcfg->{'plugin'}) {
	    $plugins = $collectcfg->{'plugin'};
	}
	if ($verbosity !~ /\d+/) {
	    if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
		$verbosity = $collectcfg->{'verbosity'};
	    } else {
		$verbosity = 2; # the default
	    }
	}
	if (defined $collectcfg->{'importdir'} && $importdir eq "") {
	    $importdir = $collectcfg->{'importdir'};
	}
	if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
	    $archivedir = $collectcfg->{'archivedir'};
	}
	if (defined $collectcfg->{'removeold'}) {
	    if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
		$removeold = 1;
	    }
	    if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
		$removeold = 0;
	    }
	}
	if (defined $collectcfg->{'keepold'}) {
	    if ($collectcfg->{'keepold'} =~ /^false$/i && !$keepold) {
		$removeold = 1;
	    }
	}
	if (defined $collectcfg->{'gzip'} && !$gzip) {
	    if ($collectcfg->{'gzip'} =~ /^true$/i) {
		$gzip = 1;
	    }
	}
	if ($maxdocs !~ /\-?\d+/) {
	    if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
		$maxdocs = $collectcfg->{'maxdocs'};
	    } else {
		$maxdocs = -1; # the default
	    }
	}
	if ($groupsize == 1) {
	    if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
		$groupsize = $collectcfg->{'groupsize'};
	    }
	}
	if ($OIDtype !~ /^(hash|incremental)$/) {
	    if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental)$/) {
		$OIDtype = $collectcfg->{'OIDtype'};
	    } else {
		$OIDtype = "hash"; # the default
	    }
	}
	# the command line takes precedence over the collect.cfg
	if (defined $collectcfg->{'sortmeta'} && !defined $sortmeta) {
	    $sortmeta = $collectcfg->{'sortmeta'};
	}
	if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
	    $debug = 1;
	}

    } else {
	die "Couldn't find the configuration file $configfilename\n";
    }

    # check sortmeta -- done only now, so that sortmeta and groupsize
    # values coming from the collect.cfg are covered as well
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
	print $out "WARNING: import.pl cannot sort documents when groupsize > 1\n";
	print $out " sortmeta option will be ignored\n\n";
	$sortmeta = undef;
    }

    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillogname);
    if (scalar(@$pluginfo) == 0) {
	print $out "No plugins were loaded.\n";
	die "\n";
    }

    # remove the old contents of the archives directory if needed
    if ($removeold && -e $archivedir) {
	print $out "Removing current contents of the archives directory\n";
	sleep(3); # just in case...
	&util::rm_r ($archivedir);
    }

    # read the archive information file
    if (!$debug) {
	$archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
	$archive_info = arcinfo->new ();
	$archive_info->load_info ($archive_info_filename);

	# create a docsave object to process the documents
	$processor = docsave->new ($collection, $archive_info, $verbosity, $gzip, $groupsize, $out);
	$processor->setarchivedir ($archivedir);
	$processor->set_sortmeta ($sortmeta) if defined $sortmeta;
	$processor->set_OIDtype ($OIDtype);
    } else {
	# -debug: documents are handed to a docprint object instead of
	# being saved to the archives directory
	$processor = docprint->new ();
    }

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

    # process the import directory
    &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs);

    &plugin::end($pluginfo, $processor);

    # write out the archive information file
    if (!$debug) {
	$processor->close_file_output() if $groupsize > 1;
	$archive_info->save_info($archive_info_filename);
    }

    close OUT if $close_out;
}