#!/usr/bin/perl -w

###########################################################################
#
# import.pl --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# This program will import a number of files into a particular collection

package import;

# The Greenstone library directories are derived from the environment, so
# refuse to start when the required variables are missing.
BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/classify");
}

use arcinfo;
use colcfg;
use plugin;
use docprint;
use util;
use parsargv;
use FileHandle;
use printusage;

# Permitted values of the -OIDtype option, with their descriptions.
my $oidtype_list =
    [ { 'name' => "hash",
        'desc' => "Hashes the contents of the file. Document identifier will be the same every time the collection is imported." },
      { 'name' => "incremental",
        'desc' => "A simple document count that is significantly faster than \"hash\". It is not guaranteed to always assign the same identifier to a given document though and does not allow further documents to be added to existing xml archives." } ];

# Description of every command line option.  This table is only used to
# produce the usage output; the options are actually parsed in main().
my $arguments =
    [ { 'name' => "archivedir",
        'desc' => "Where the converted material ends up.",
        'type' => "string",
        'reqd' => "no" },
      { 'name' => "collectdir",
        'desc' => "Collection directory.",
        'type' => "string",
        'deft' => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
        'reqd' => "no" },
      { 'name' => "debug",
        'desc' => "Print imported text to STDOUT.",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "faillog",
        'desc' => "Fail log filename. This log receives the filenames of any files which fail to be processed.",
        'type' => "string",
        'deft' => &util::filename_cat("", "colname", "etc", "fail.log"),
        'reqd' => "no" },
      { 'name' => "groupsize",
        'desc' => "Number of import documents to group into one XML file.",
        'type' => "int",
        'deft' => "1",
        'reqd' => "no" },
      { 'name' => "gzip",
        'desc' => "Use gzip to compress resulting xml documents (don't forget to include ZIPPlug in your plugin list when building from compressed documents).",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "importdir",
        'desc' => "Where the original material lives.",
        'type' => "string",
        'reqd' => "no" },
      { 'name' => "keepold",
        'desc' => "Will not destroy the current contents of the archives directory (the default).",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "maxdocs",
        'desc' => "Maximum number of documents to import.",
        'type' => "int",
        'reqd' => "no" },
      { 'name' => "OIDtype",
        'desc' => "The method to use when generating unique identifiers for each document.",
        'type' => "enum",
        'list' => $oidtype_list,
        'deft' => "hash",
        'reqd' => "no" },
      { 'name' => "out",
        'desc' => "Filename or handle to print output status to.",
        'type' => "string",
        'deft' => "STDERR",
        'reqd' => "no" },
      { 'name' => "removeold",
        'desc' => "Will remove the old contents of the archives directory -- use with care.",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "sortmeta",
        'desc' => "Sort documents alphabetically by metadata for building. This will be disabled if groupsize > 1.",
        'type' => "string",
        'reqd' => "no" },
      { 'name' => "statsfile",
        'desc' => "Filename or handle to print import statistics to.",
        'type' => "string",
        'deft' => "STDERR",
        'reqd' => "no" },
      { 'name' => "verbosity",
        'desc' => "0=none, 3=lots",
        'type' => "int",
        'deft' => "2",
        'reqd' => "no" } ];

# Program name, description and option table handed to the usage printers.
my $options = { 'name' => "import.pl",
                'desc' => "PERL script used to import files into a GML format ready for building.",
                'args' => $arguments };


# Print the program description and its options to STDERR using the XML
# helpers of PrintUsage.  Called when the -xml flag is given.
#
# NOTE(review): the literal strings printed below contain no XML element
# tags, although they are emitted between print_xml_header() and
# print_options_xml().  Confirm against the upstream script whether markup
# has been lost from these strings.
sub print_xml_usage
{
    &PrintUsage::print_xml_header();

    print STDERR "\n";
    print STDERR " $options->{'name'}\n";
    print STDERR " $options->{'desc'}\n";
    print STDERR " \n";
    if (defined($options->{'args'})) {
        &PrintUsage::print_options_xml($options->{'args'});
    }
    print STDERR " \n";
    print STDERR "\n";
}


# Print a plain text usage message (usage line plus the option table) to
# STDERR.
sub print_txt_usage
{
    # Lexicals are sufficient here: the values are handed to PrintUsage
    # explicitly as arguments.
    my $programname = $options->{'name'};
    my $programargs = $options->{'args'};

    # Find the length of the longest option string
    my $descoffset = 0;
    if (defined($programargs)) {
        $descoffset = &PrintUsage::find_longest_option_string($programargs);
    }

    # Produce the usage information using the data structure above
    print STDERR " usage: $programname [options] collection-name\n\n";

    # Display the program options, if there are some
    if (defined($programargs)) {
        # Calculate the column offset of the option descriptions
        my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions

        print STDERR " options:\n";

        # Display the program options
        &PrintUsage::print_options_txt($programargs, $optiondescoffset);
    }
}


# sub print_usage {
#     print STDOUT "\n";
#     print STDOUT "import.pl: Converts documents in collections -importdir directory into\n";
#     print STDOUT " xml documents which are written to the -archivedir directory.\n\n";
#     print STDOUT " usage: $0 [options] collection-name\n\n";
#     print STDOUT " options:\n";
#     print STDOUT " -verbosity number 0=none, 3=lots\n";
#     print STDOUT " -importdir directory Where the original material lives\n";
#     print STDOUT " -archivedir directory Where the converted material ends up\n";
#     print STDOUT " -keepold Will not destroy the current contents of the\n";
#     print STDOUT " archives directory (the default)\n";
#     print STDOUT " -removeold Will remove the old contents of the archives\n";
#     print STDOUT " directory -- use with care\n";
#     print STDOUT " -gzip Use gzip to compress resulting xml documents\n";
#     print STDOUT " (don't forget to include ZIPPlug in your plugin\n";
#     print STDOUT " list when building from compressed documents)\n";
#     print STDOUT " -maxdocs number Maximum number of documents to import\n";
#     print STDOUT " -groupsize number Number of import documents to group into one XML file\n";
#     print STDOUT " -OIDtype hash|incremental The method to use when generating unique\n";
#     print STDOUT " identifiers for each document. \"hash\" (the\n";
#     print STDOUT " default) hashes the contents of the file and so\n";
#     print STDOUT " will be the same every time the collection is\n";
#     print STDOUT " imported. \"incremental\" is a simple document\n";
#     print STDOUT " count and so will be significantly faster than\n";
#     print STDOUT " \"hash\". It is not guaranteed to always assign\n";
#     print STDOUT " the same identifier to a given document though\n";
#     print STDOUT " and does not allow further documents to be added\n";
#     print STDOUT " to existing xml archives\n";
#     print STDOUT " -sortmeta metadata Sort documents alphabetically by metadata for\n";
#     print STDOUT " building. This will be disabled if groupsize > 1\n";
#     print STDOUT " -debug Print imported text to STDOUT\n";
#     print STDOUT " -collectdir directory Collection directory (defaults to " .
#         &util::filename_cat ($ENV{'GSDLHOME'}, "collect") . ")\n";
#     print STDOUT " -out name Filename or handle to print output status to.\n";
#     print STDOUT " The default is STDERR\n";
#     print STDOUT " -statsfile name Filename or handle to print import statistics to.\n";
#     print STDOUT " The default is STDERR\n";
#     print STDOUT " -faillog name Fail log filename. This log receives the filenames\n";
#     print STDOUT " of any files which fail to be processed (defaults\n";
#     print STDOUT " to " .
#         &util::filename_cat("", "colname", "etc", "fail.log") . ")\n";
#     print STDOUT " [Type \"perl -S import.pl | more\" if this help text scrolled off your screen]";
#     print STDOUT "\n" unless $ENV{'GSDLOS'} =~ /^windows$/i;
# }


&main();

# Entry point: parses the command line, merges in the settings found in the
# collection's etc/collect.cfg, loads the plugins, runs them over the import
# directory and finally writes the archive information file and the import
# statistics.
#
# Dies (after printing usage where appropriate) on bad arguments, when the
# output or fail log file cannot be opened, when collect.cfg is missing, or
# when no plugins could be loaded.
sub main {
    my ($verbosity, $importdir, $archivedir, $keepold, $removeold,
        $gzip, $groupsize, $OIDtype, $debug, $maxdocs, $collection,
        $configfilename, $collectcfg, $pluginfo, $sortmeta,
        $archive_info_filename, $statsfile, $archive_info, $processor,
        $out, $faillog, $collectdir);

    # ***** 11-04-03 - John Thompson *****
    my $xml = 0;
    # ************************************

    # note that no defaults are passed for most options as they're set
    # later (after we check the collect.cfg file)
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/', \$verbosity,
                         'importdir/.*/', \$importdir,
                         'archivedir/.*/', \$archivedir,
                         'keepold', \$keepold,
                         'removeold', \$removeold,
                         'gzip', \$gzip,
                         'groupsize/\d+/1', \$groupsize,
                         'OIDtype/^(hash|incremental)$/', \$OIDtype,
                         'sortmeta/.*/', \$sortmeta,
                         'debug', \$debug,
                         'maxdocs/^\-?\d+/', \$maxdocs,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out,
                         'statsfile/.*/STDERR', \$statsfile,
                         'faillog/.*/', \$faillog,
                         q^xml^, \$xml)) {
        &print_txt_usage();
        die "\n";
    }

    if ($xml) {
        &print_xml_usage();
        die "\n";
    }

    # $out is used as a symbolic file handle name from here on.
    my $close_out = 0;
    if ($out =~ /^(STDERR|STDOUT)$/i) {
        # The match is case-insensitive but handle names are not, so
        # normalise e.g. "stderr" to the real handle name.
        $out = uc($out);
    }
    else {
        # Three-argument open: the user-supplied name is always treated as
        # a plain file name, never as an open mode.
        open (OUT, '>', $out) || die "Couldn't open output file $out\n";
        $out = 'import::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    # -keepold takes precedence over -removeold
    $removeold = 0 if ($keepold);

    # get and check the collection name
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
        &print_txt_usage();
        die "\n";
    }

    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    open (FAILLOG, '>', $faillog) || die "Couldn't open fail log $faillog\n";
    # Remember the file name for the statistics; $faillog itself becomes the
    # symbolic handle name that is passed to the plugins.
    my $faillogname = $faillog;
    $faillog = 'import::FAILLOG';
    $faillog->autoflush(1);

    # treat an empty or blank -sortmeta as "not given"
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;

    # dynamically load the 'docsave' module so that a collection specific
    # docsave.pm is picked up if one is present.
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    require docsave;

    # get the list of plugins for this collection and set any options that
    # were specified in the collect.cfg (all import.pl options except
    # -collectdir, -out and -faillog may be specified in the collect.cfg (these
    # options must be known before we read the collect.cfg))
    my $plugins = [];
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg ($configfilename);
        if (defined $collectcfg->{'plugin'}) {
            $plugins = $collectcfg->{'plugin'};
        }
        if ($verbosity !~ /\d+/) {
            if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
                $verbosity = $collectcfg->{'verbosity'};
            } else {
                $verbosity = 2; # the default
            }
        }
        if (defined $collectcfg->{'importdir'} && $importdir eq "") {
            $importdir = $collectcfg->{'importdir'};
        }
        if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
            $archivedir = $collectcfg->{'archivedir'};
        }
        if (defined $collectcfg->{'removeold'}) {
            if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
                $removeold = 1;
            }
            if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
                $removeold = 0;
            }
        }
        if (defined $collectcfg->{'keepold'}) {
            if ($collectcfg->{'keepold'} =~ /^false$/i && !$keepold) {
                $removeold = 1;
            }
        }
        if (defined $collectcfg->{'gzip'} && !$gzip) {
            if ($collectcfg->{'gzip'} =~ /^true$/i) {
                $gzip = 1;
            }
        }
        if ($maxdocs !~ /\-?\d+/) {
            if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
                $maxdocs = $collectcfg->{'maxdocs'};
            } else {
                $maxdocs = -1; # the default
            }
        }
        if ($groupsize == 1) {
            if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
                $groupsize = $collectcfg->{'groupsize'};
            }
        }
        if ($OIDtype !~ /^(hash|incremental)$/) {
            if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental)$/) {
                $OIDtype = $collectcfg->{'OIDtype'};
            } else {
                $OIDtype = "hash"; # the default
            }
        }
        # $sortmeta is undef (not "") when it was not given on the command
        # line, see the normalisation above.
        if (defined $collectcfg->{'sortmeta'} && !defined $sortmeta) {
            $sortmeta = $collectcfg->{'sortmeta'};
        }
        if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
            $debug = 1;
        }

    } else {
        die "Couldn't find the configuration file $configfilename\n";
    }

    # check sortmeta.  This has to happen after the collect.cfg has been
    # merged in, because both sortmeta and groupsize may come from there.
    if (defined $sortmeta && $groupsize > 1) {
        print $out "WARNING: import.pl cannot sort documents when groupsize > 1\n";
        print $out " sortmeta option will be ignored\n\n";
        $sortmeta = undef;
    }

    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog);
    if (scalar(@$pluginfo) == 0) {
        print $out "No plugins were loaded.\n";
        die "\n";
    }

    # remove the old contents of the archives directory if needed
    if ($removeold && -e $archivedir) {
        print $out "Removing current contents of the archives directory\n";
        sleep(3); # just in case...
        &util::rm_r ($archivedir);
    }

    # read the archive information file
    if (!$debug) {
        $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
        $archive_info = arcinfo->new();
        $archive_info->load_info ($archive_info_filename);

        # create a docsave object to process the documents
        $processor = docsave->new($collection, $archive_info, $verbosity, $gzip, $groupsize, $out);
        $processor->setarchivedir ($archivedir);
        $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
        $processor->set_OIDtype ($OIDtype);
    } else {
        # in debug mode the documents go to a docprint object instead
        $processor = docprint->new();
    }

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

    # process the import directory
    &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs);

    &plugin::end($pluginfo, $processor);

    # write out the archive information file
    if (!$debug) {
        $processor->close_file_output() if $groupsize > 1;
        $archive_info->save_info($archive_info_filename);
    }

    # write out import stats
    my $close_stats = 0;
    if ($statsfile =~ /^(STDERR|STDOUT)$/i) {
        # same case normalisation as for $out above
        $statsfile = uc($statsfile);
    }
    elsif (open (STATS, '>', $statsfile)) {
        $statsfile = 'import::STATS';
        $close_stats = 1;
    }
    else {
        print $out "WARNING: couldn't open stats file $statsfile\n";
        print $out " will print stats to STDERR instead\n";
        $statsfile = 'STDERR';
    }

    print $out "\n";
    print $out "*********************************************\n";
    print $out "Import Complete\n";
    print $out "*********************************************\n";

    &plugin::write_stats($pluginfo, $statsfile, $faillogname);
    if ($close_stats) {
        close STATS;
    }

    close OUT if $close_out;
    close FAILLOG;
}