#!/usr/bin/perl -w

###########################################################################
#
# import.pl --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


# This program will import a number of files into a particular collection

package import;

# Compile-time setup. This runs before the "use" statements that follow are
# compiled: it insists that the Greenstone environment variables are present
# and pushes the Greenstone perl library directories onto the front of @INC.
BEGIN {
    foreach my $required ('GSDLHOME', 'GSDLOS') {
        die "$required not set\n" unless defined $ENV{$required};
    }

    # Every directory is unshifted in turn, so the last one listed here ends
    # up being searched first.
    foreach my $libdir ("perllib", "perllib/plugins", "perllib/classify") {
        unshift (@INC, "$ENV{'GSDLHOME'}/$libdir");
    }
}

use arcinfo;
use colcfg;
use plugin;
use docprint;
use util;
use parsargv;
use FileHandle;
use printusage;

# The values accepted by the -OIDtype option, each paired with the help text
# describing it. Referenced from the 'list' entry of the OIDtype argument in
# the option table.
my $oidtype_list = [
    {
        name => "hash",
        desc => "Hashes the contents of the file. Document identifier will be the same every time the collection is imported.",
    },
    {
        name => "incremental",
        desc => "A simple document count that is significantly faster than \"hash\". It is not guaranteed to always assign the same identifier to a given document though and does not allow further documents to be added to existing xml archives.",
    },
];

# Descriptive table of the command-line options, one hash per option, kept
# in alphabetical order. It is attached to $options as 'args' and handed to
# the PrintUsage routines to produce the usage output; the specification
# that actually drives argument parsing is written out separately in main().
#   name - option name          desc - help text
#   type - kind of value        reqd - "no" for every option here
#   deft - value shown as the default (presumably; only some options have one)
#   list - permitted values, for the enum-typed option
my $arguments = [
    {
        name => "archivedir",
        desc => "Where the converted material ends up.",
        type => "string",
        reqd => "no",
    },
    {
        name => "collectdir",
        desc => "Collection directory.",
        type => "string",
        deft => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
        reqd => "no",
    },
    {
        name => "debug",
        desc => "Print imported text to STDOUT.",
        type => "flag",
        reqd => "no",
    },
    {
        name => "faillog",
        desc => "Fail log filename. This log receives the filenames of any files which fail to be processed.",
        type => "string",
        # Placeholder path components: this value is only a description of
        # where the log goes, not a real path.
        deft => &util::filename_cat("<collectdir>", "colname", "etc", "fail.log"),
        reqd => "no",
    },
    {
        name => "groupsize",
        desc => "Number of import documents to group into one XML file.",
        type => "int",
        deft => "1",
        reqd => "no",
    },
    {
        name => "gzip",
        desc => "Use gzip to compress resulting xml documents (don't forget to include ZIPPlug in your plugin list when building from compressed documents).",
        type => "flag",
        reqd => "no",
    },
    {
        name => "importdir",
        desc => "Where the original material lives.",
        type => "string",
        reqd => "no",
    },
    {
        name => "keepold",
        desc => "Will not destroy the current contents of the archives directory (the default).",
        type => "flag",
        reqd => "no",
    },
    {
        name => "maxdocs",
        desc => "Maximum number of documents to import.",
        type => "int",
        reqd => "no",
    },
    {
        name => "OIDtype",
        desc => "The method to use when generating unique identifiers for each document.",
        type => "enum",
        list => $oidtype_list,
        deft => "hash",
        reqd => "no",
    },
    {
        name => "out",
        desc => "Filename or handle to print output status to.",
        type => "string",
        deft => "STDERR",
        reqd => "no",
    },
    {
        name => "removeold",
        desc => "Will remove the old contents of the archives directory -- use with care.",
        type => "flag",
        reqd => "no",
    },
    {
        name => "sortmeta",
        desc => "Sort documents alphabetically by metadata for building. This will be disabled if groupsize > 1.",
        type => "string",
        reqd => "no",
    },
    {
        name => "statsfile",
        desc => "Filename or handle to print import statistics to.",
        type => "string",
        deft => "STDERR",
        reqd => "no",
    },
    {
        name => "verbosity",
        desc => "0=none, 3=lots",
        type => "int",
        deft => "2",
        reqd => "no",
    },
];

# Top-level description of this script (its name, a one-line summary and
# the option table) as read by print_xml_usage and print_txt_usage.
my $options = {
    name => "import.pl",
    desc => "PERL script used to import files into a GML format ready for building.",
    args => $arguments,
};


# print_xml_usage --
# Writes the usage information for this script to STDERR as XML. After the
# header produced by PrintUsage, an <Info> element is emitted holding the
# script name, its description and an <Arguments> section filled in by
# PrintUsage from the option table (when there is one).
sub print_xml_usage
{
    &PrintUsage::print_xml_header();

    my $arglist = $options->{'args'};

    print STDERR "<Info>\n",
                 "  <Name>$options->{'name'}</Name>\n",
                 "  <Desc>$options->{'desc'}</Desc>\n",
                 "  <Arguments>\n";

    &PrintUsage::print_options_xml($arglist) if defined($arglist);

    print STDERR "  </Arguments>\n",
                 "</Info>\n";
}


# print_txt_usage --
# Writes the plain-text usage message for this script to STDERR: a usage
# line and, when $options carries an option table, an " options:" section
# laid out by PrintUsage.
sub print_txt_usage
{
    # Lexical ("my") variables rather than "local": everything PrintUsage
    # needs is passed to it explicitly as arguments, so there is no reason to
    # create dynamically scoped package globals here.
    my $programname = $options->{'name'};
    my $programargs = $options->{'args'};

    # Find the length of the longest option string
    my $descoffset = 0;
    if (defined($programargs)) {
        $descoffset = &PrintUsage::find_longest_option_string($programargs);
    }

    # Produce the usage information using the data structure above
    print STDERR " usage: $programname [options] collection-name\n\n";

    # Display the program options, if there are some
    if (defined($programargs)) {
        # Calculate the column offset of the option descriptions
        my $optiondescoffset = $descoffset + 2;  # 2 spaces between options & descriptions

        print STDERR " options:\n";

        # Display the program options
        &PrintUsage::print_options_txt($programargs, $optiondescoffset);
    }
}


#  sub print_usage {
#      print STDOUT "\n";
#      print STDOUT "import.pl: Converts documents in collections -importdir directory into\n";
#      print STDOUT "           xml documents which are written to the -archivedir directory.\n\n";
#      print STDOUT "  usage: $0 [options] collection-name\n\n";
#      print STDOUT "  options:\n";
#      print STDOUT "   -verbosity number      0=none, 3=lots\n";
#      print STDOUT "   -importdir directory   Where the original material lives\n";
#      print STDOUT "   -archivedir directory  Where the converted material ends up\n";
#      print STDOUT "   -keepold               Will not destroy the current contents of the\n";
#      print STDOUT "                          archives directory (the default)\n";
#      print STDOUT "   -removeold             Will remove the old contents of the archives\n";
#      print STDOUT "                          directory -- use with care\n";
#      print STDOUT "   -gzip                  Use gzip to compress resulting xml documents\n";
#      print STDOUT "                          (don't forget to include ZIPPlug in your plugin\n";
#      print STDOUT "                          list when building from compressed documents)\n";
#      print STDOUT "   -maxdocs number        Maximum number of documents to import\n";
#      print STDOUT "   -groupsize number      Number of import documents to group into one XML file\n";
#      print STDOUT "   -OIDtype hash|incremental The method to use when generating unique\n";
#      print STDOUT "                          identifiers for each document. \"hash\" (the\n";
#      print STDOUT "                          default) hashes the contents of the file and so\n";
#      print STDOUT "                          will be the same every time the collection is\n";
#      print STDOUT "                          imported. \"incremental\" is a simple document\n";
#      print STDOUT "                          count and so will be significantly faster than\n";
#      print STDOUT "                          \"hash\". It is not guaranteed to always assign\n";
#      print STDOUT "                          the same identifier to a given document though\n";
#      print STDOUT "                          and does not allow further documents to be added\n";
#      print STDOUT "                          to existing xml archives\n";
#      print STDOUT "   -sortmeta metadata     Sort documents alphabetically by metadata for\n";
#      print STDOUT "                          building. This will be disabled if groupsize > 1\n";
#      print STDOUT "   -debug                 Print imported text to STDOUT\n";
#      print STDOUT "   -collectdir directory  Collection directory (defaults to " .
#  	&util::filename_cat ($ENV{'GSDLHOME'}, "collect") . ")\n";
#      print STDOUT "   -out name              Filename or handle to print output status to.\n";
#      print STDOUT "                          The default is STDERR\n";
#      print STDOUT "   -statsfile name        Filename or handle to print import statistics to.\n";
#      print STDOUT "                          The default is STDERR\n";
#      print STDOUT "   -faillog name          Fail log filename. This log receives the filenames\n";
#      print STDOUT "                          of any files which fail to be processed (defaults\n";
#      print STDOUT "                          to " . 
#  	&util::filename_cat("<collectdir>", "colname", "etc", "fail.log") . ")\n";
#      print STDOUT "  [Type \"perl -S import.pl | more\" if this help text scrolled off your screen]";
#      print STDOUT "\n" unless $ENV{'GSDLOS'} =~ /^windows$/i;
#  }

&main();

# main --
# Entry point of the script. In order, it:
#   1. parses the command line (printing usage and dying on bad arguments,
#      or printing the XML usage and dying when -xml is given);
#   2. opens the status output and the fail log;
#   3. selects the collection and merges in any settings from the
#      collection's etc/collect.cfg that were not given on the command line;
#   4. loads the plugins, optionally empties the archives directory, and
#      runs the plugins over the import directory;
#   5. saves the archive information file (unless -debug) and writes the
#      import statistics.
#
# Reads @ARGV and %ENV. $ENV{'GSDLCOLLECTDIR'} is only read after
# &util::use_collection has been called (presumably that call sets it --
# confirm against util.pm).
#
# Dies when the output file or fail log cannot be opened, when no collection
# can be determined, when collect.cfg is missing, or when no plugins load.
sub main {
    my ($verbosity, $importdir, $archivedir, $keepold,
        $removeold, $gzip, $groupsize, $OIDtype, $debug,
        $maxdocs, $collection, $configfilename, $collectcfg,
        $pluginfo, $sortmeta, $archive_info_filename, $statsfile,
        $archive_info, $processor, $out, $faillog, $collectdir);

    # ***** 11-04-03 - John Thompson *****
    my $xml = 0;
    # ************************************

    # note that no defaults are passed for most options as they're set
    # later (after we check the collect.cfg file)
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/', \$verbosity,
                         'importdir/.*/', \$importdir,
                         'archivedir/.*/', \$archivedir,
                         'keepold', \$keepold,
                         'removeold', \$removeold,
                         'gzip', \$gzip,
                         'groupsize/\d+/1', \$groupsize,
                         'OIDtype/^(hash|incremental)$/', \$OIDtype,
                         'sortmeta/.*/', \$sortmeta,
                         'debug', \$debug,
                         'maxdocs/^\-?\d+/', \$maxdocs,
                         'collectdir/.*/', \$collectdir,
                         'out/.*/STDERR', \$out,
                         'statsfile/.*/STDERR', \$statsfile,
                         'faillog/.*/', \$faillog,
                         'xml', \$xml)) {
        &print_txt_usage();
        die "\n";
    }

    if ($xml) {
        &print_xml_usage();
        die "\n";
    }

    # $out is used below as a symbolic filehandle name, both here and by the
    # routines it is handed to.
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # three-argument open: the filename is taken literally, so characters
        # such as a leading '>' or surrounding whitespace cannot change the
        # open mode
        open (OUT, '>', $out) || die "Couldn't open output file $out\n";
        $out = 'import::OUT';
        $close_out = 1;
    } else {
        # The match above is case-insensitive, but only the upper-case names
        # are forced into package main when used as a handle name from
        # another package, so normalise e.g. "stderr" to "STDERR".
        $out = uc($out);
    }
    $out->autoflush(1);

    # -keepold wins over -removeold when both are given
    $removeold = 0 if ($keepold);

    # get and check the collection name. Only the first remaining argument
    # is passed: handing over @ARGV itself would flatten into the argument
    # list, so zero or several leftover arguments would shift $collectdir
    # into the wrong position.
    my $collection_arg = @ARGV ? $ARGV[0] : "";
    if (($collection = &util::use_collection($collection_arg, $collectdir)) eq "") {
        &print_txt_usage();
        die "\n";
    }

    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    open (FAILLOG, '>', $faillog) || die "Couldn't open fail log $faillog\n";
    my $faillogname = $faillog;   # keep the real filename for the stats output
    $faillog = 'import::FAILLOG';
    $faillog->autoflush(1);

    # treat a missing or blank -sortmeta as "not given" so that collect.cfg
    # may supply one below
    $sortmeta = "" unless defined $sortmeta && $sortmeta =~ /\S/;

    # dynamically load 'docsave' module so it can pick up on a collection
    # specific docsave.pm if one is specified.

    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    require docsave;


    # get the list of plugins for this collection and set any options that
    # were specified in the collect.cfg (all import.pl options except
    # -collectdir, -out and -faillog may be specified in the collect.cfg (these
    # options must be known before we read the collect.cfg))
    my $plugins = [];
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (!-e $configfilename) {
        die "Couldn't find the configuration file $configfilename\n";
    }

    $collectcfg = &colcfg::read_collect_cfg ($configfilename);
    if (defined $collectcfg->{'plugin'}) {
        $plugins = $collectcfg->{'plugin'};
    }

    # In each case below a value given on the command line takes precedence
    # over the one in collect.cfg.
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    if (defined $collectcfg->{'importdir'} && $importdir eq "") {
        $importdir = $collectcfg->{'importdir'};
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    # "removeold true" or "keepold false" in collect.cfg turns removal on,
    # unless -keepold was given. "removeold false" needs no action: it could
    # only ever reset an already-false value.
    if (defined $collectcfg->{'removeold'}) {
        if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
            $removeold = 1;
        }
    }
    if (defined $collectcfg->{'keepold'}) {
        if ($collectcfg->{'keepold'} =~ /^false$/i && !$keepold) {
            $removeold = 1;
        }
    }
    if (defined $collectcfg->{'gzip'} && !$gzip) {
        if ($collectcfg->{'gzip'} =~ /^true$/i) {
            $gzip = 1;
        }
    }
    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        } else {
            $maxdocs = -1; # the default
        }
    }
    # a groupsize of 1 is the parse default, so it is treated as "not given"
    if ($groupsize == 1) {
        if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
            $groupsize = $collectcfg->{'groupsize'};
        }
    }
    if ($OIDtype !~ /^(hash|incremental)$/) {
        if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental)$/) {
            $OIDtype = $collectcfg->{'OIDtype'};
        } else {
            $OIDtype = "hash"; # the default
        }
    }
    if (defined $collectcfg->{'sortmeta'} && $sortmeta eq "") {
        $sortmeta = $collectcfg->{'sortmeta'};
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }

    # check sortmeta. This has to happen after collect.cfg has been merged
    # in, because both sortmeta and groupsize may come from there.
    $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
    if (defined $sortmeta && $groupsize > 1) {
        print $out "WARNING: import.pl cannot sort documents when groupsize > 1\n";
        print $out "         sortmeta option will be ignored\n\n";
        $sortmeta = undef;
    }

    # fill in the default import and archives directories if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    # load all the plugins
    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog);
    if (scalar(@$pluginfo) == 0) {
        print $out "No plugins were loaded.\n";
        die "\n";
    }

    # remove the old contents of the archives directory if needed
    if ($removeold && -e $archivedir) {
        print $out "Removing current contents of the archives directory\n";
        sleep(3); # just in case...
        &util::rm_r ($archivedir);
    }

    # read the archive information file
    if (!$debug) {
        $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
        $archive_info = arcinfo->new ();
        $archive_info->load_info ($archive_info_filename);

        # create a docsave object to process the documents. docsave is
        # loaded with "require" at run time, so the explicit Class->new
        # form is used rather than indirect object syntax.
        $processor = docsave->new ($collection, $archive_info, $verbosity, $gzip, $groupsize, $out);
        $processor->setarchivedir ($archivedir);
        $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
        $processor->set_OIDtype ($OIDtype);
    } else {
        # in debug mode nothing is saved; docprint is used as the processor
        $processor = docprint->new ();
    }

    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

    # process the import directory
    &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs);

    &plugin::end($pluginfo, $processor);

    # write out the archive information file
    if (!$debug) {
        $processor->close_file_output() if $groupsize > 1;
        $archive_info->save_info($archive_info_filename);
    }

    # write out import stats
    my $close_stats = 0;
    if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
        if (open (STATS, '>', $statsfile)) {
            $statsfile = 'import::STATS';
            $close_stats = 1;
        } else {
            print $out "WARNING: couldn't open stats file $statsfile\n";
            print $out "         will print stats to STDERR instead\n";
            $statsfile = 'STDERR';
        }
    } else {
        # same normalisation as for $out: the handle name is resolved inside
        # another package, where only the upper-case names are special
        $statsfile = uc($statsfile);
    }

    print $out "\n";
    print $out "*********************************************\n";
    print $out "Import Complete\n";
    print $out "*********************************************\n";

    &plugin::write_stats($pluginfo, $statsfile, $faillogname);
    if ($close_stats) {
        close STATS;
    }

    close OUT if $close_out;
    close FAILLOG;
}