###########################################################################
#
# mgppbuilder.pm -- MGBuilder object
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package mgppbuilder;

use classify;
use cfgread;
use colcfg;
use plugin;
use util;
use FileHandle;


BEGIN {
    # Switch both standard streams to autoflush as early as possible so
    # that output from mgpp and output from the plugins stay in step.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

END {
    # Turn autoflush off again on both standard streams at exit.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

# NOTE(review): not referenced by any code in the visible part of this file
# (only mentioned in a comment in compress_text) - confirm whether it is
# still needed.
$maxdocsize = 12000;

# Two mappings share this hash:
#  - the level name as used in this module ('document', 'section',
#    'paragraph') to the short tag handed to mgpp_passes via -J / -K
#  - that short tag to the macro written to the info database as the
#    default display name of the level (see make_infodatabase)
%level_map = ('document'=>'Doc',
	      'section'=>'Sec',
	      'paragraph'=>'Para',
	      'Doc'=>'_textdocument_',
	      'Sec'=>'_textsection_',
	      'Para'=>'_textparagraph_');

#$doc_level = "Doc";
#$sec_level = "Sec";
#$para_level = "Para";

# File suffixes that build_index treats as wanted when it scans an index
# directory; any other suffix is reported as a deletion candidate.
%wanted_index_files = ('td'=>1,
		       't'=>1,
		       'tl'=>1,
		       'ti'=>1,
		       'idb'=>1,
		       'ib1'=>1,
		       'ib2'=>1,
		       'ib3'=>1,
		       'i'=>1,
		       'il'=>1,
		       'w'=>1,
		       'wa'=>1);

# Initial index field map: long field name => short field code.
# Every short code is also present as a key with the value 1, as are the
# words AND, OR, NOT and NEAR and the level tags (Doc, Sec, Para): these
# cannot be used as field names. make_infodatabase skips entries whose
# value is 1.
# TODO: let users add their own mappings via a file or collect.cfg.
%static_indexfield_map = ('Title'=>'TI',
			  'TI'=>1,
			  'Subject'=>'SU',
			  'SU'=>1,
			  'Creator'=>'CR',
			  'CR'=>1,
			  'Organization'=>'ORG',
			  'ORG'=>1,
			  'Source'=>'SO',
			  'SO'=>1,
			  'Howto'=>'HT',
			  'HT'=>1,
			  'ItemTitle'=>'IT',
			  'IT'=>1,
			  'ProgNumber'=>'PN',
			  'PN'=>1,
			  'People'=>'PE',
			  'PE'=>1,
			  'Coverage'=>'CO',
			  'CO'=>1,
			  'allfields'=>'ZZ',
			  'ZZ'=>1,
			  'text'=>'TX',
			  'TX'=>1,
			  'AND'=>1,
			  'OR'=>1,
			  'NOT'=>1,
			  'NEAR'=>1,
			  'Doc'=>1,
			  'Sec'=>1,
			  'Para'=>1);

# Constructs an mgppbuilder for one collection.
#
# Positional arguments:
#   $collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug,
#   $keepold, $allclassifications,
#   $outhandle - handle for progress messages (defaults to STDERR),
#   $no_text   - defaults to 0,
#   $gli       - defaults to 0; when true, XML progress tags are written
#                to STDERR by the build methods.
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, works out the list of indexes
# (fields x subcollections x languages, duplicates removed) and the
# indexing levels, loads the plugins and classifiers, and creates the
# document processor (a collection specific "<collection>buildproc" if
# one exists in the collection's perllib directory, otherwise
# mgppbuildproc).
#
# Dies if collect.cfg is missing, if neither the document nor the section
# level is configured, if no plugins could be loaded, or if the document
# processor cannot be created.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text, $gli) = @_;

    # The default is deliberately the handle *name* (which is what the
    # bareword STDERR evaluates to without "use strict"): build_index
    # compares $outhandle against the string "STDERR".
    $outhandle = 'STDERR' unless defined $outhandle;
    $no_text = 0 unless defined $no_text;
    $gli = 0 unless defined $gli;

    # create an mgppbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'notbuilt'=>{},    # indexes not built
                      'indexfieldmap'=>\%static_indexfield_map,
                      'gli'=>$gli
                  }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    my $cfg = &colcfg::read_collect_cfg ($colcfgname);
    $self->{'collect_cfg'} = $cfg;

    # sort out the indexes
    # indexes are specified with spaces, but we put them into one index
    my $fields = $cfg->{'indexes'};
    my @indexes = (join (',', @$fields));

    # sort out subcollection indexes: one index per subcollection
    if (defined $cfg->{'indexsubcollections'}) {
        my @expanded = ();
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@indexes) {
                push (@expanded, "$index:$subcollection");
            }
        }
        @indexes = @expanded;
    }

    # sort out language subindexes: one index per language. When there
    # are no subcollections an empty subcollection field is added so the
    # language always ends up in the third position.
    if (defined $cfg->{'languages'}) {
        my $separator = defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
        my @expanded = ();
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@indexes) {
                push (@expanded, $index . $separator . $language);
            }
        }
        @indexes = @expanded;
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins, order is preserved)
    my %seen = ();
    my @unique = ();
    foreach my $index (@indexes) {
        next if defined ($seen{$index});
        $seen{$index} = 1;
        push (@unique, $index);
    }
    $cfg->{'indexes'} = \@unique;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    $self->{'levelorder'} = [];
    if (defined $cfg->{'levels'}) {
        foreach my $level (@{$cfg->{'levels'}}) {
            # $level aliases the configuration entry, so this also
            # lower-cases the names stored in collect_cfg
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else { # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    # the document level is "document" if configured, otherwise "section"
    $self->{'doc_level'} = "document";
    if (! $self->{'levels'}->{'document'}) {
        if ($self->{'levels'}->{'section'}) {
            $self->{'doc_level'} = "section";
        } else {
            die "you must have either document or section level specified!!\n";
        }
    }
    print $outhandle "doclevel = ". $self->{'doc_level'}."\n";

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $cfg->{'separate_cjk'} && $cfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, \@global_opts);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as a class method on the computed class name;
    # this needs no string eval and cannot be mis-parsed as a call to this
    # package's own new().
    eval {
        $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                    $build_dir, $verbosity,
                                                    $outhandle);
    };
    die "$@" if $@;

    return $self;
}

# Prepares the build directory for a fresh build.
# Nothing is touched when the builder runs in debug mode or was asked to
# keep the old build.
sub init {
    my ($self) = @_;

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        my $build_dir = $self->{'build_dir'};

        # throw away whatever an earlier build left behind and start
        # again with an empty build directory
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # the text directory lives directly below the build directory
        &util::mk_all_dir("$build_dir/text");
    }
}

# Records the strip_html setting on the builder and passes the same value
# on to the document processor.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}

# Creates the compressed text for the collection in <build_dir>/text.
#
# $textindex is handed to the document processor via set_index.
#
# The documents are fed through the plugins twice:
#   1. into "mgpp_passes -T1" to collect the text statistics, after which
#      mgpp_compression_dict builds the compression dictionary;
#   2. into "mgpp_passes -T2" to do the actual compression.
# In debug mode no external program is run and the document processor
# writes to STDOUT instead.
#
# Dies if one of the required executables is missing from
# $GSDLHOME/bin/$GSDLOS or if a pipe to mgpp_passes cannot be opened.
# When the gli flag is set, XML progress tags are written to STDERR.
sub compress_text {

    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    # windows wants backslashes in the prefix; everywhere else the
    # programs are additionally given "-d /"
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mgpp_passes_sections = "";
    my ($doc_level) = $self->{'doc_level'};
    $mgpp_passes_sections .= "-J " . $level_map{$doc_level} . " ";
    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level && $level ne "paragraph") {
            $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
        }
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # NOTE(review): an earlier comment here mentioned "-b $maxdocsize", but
    # no -b option is passed on the command line below.
    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    # $handle holds the *name* of the output handle: STDOUT when
    # debugging, otherwise this package's PIPEOUT pipe to mgpp_passes
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        #print $outhandle "trying to run (compress 1) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra\n";
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # closing the pipe ends the first pass (in debug mode there is no
    # pipe to close)
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "<FatalError name='NoRunMGCompress'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the same named handle as a pipe to the second pass; the
        # document processor was given this handle name above
        #print $outhandle "trying to run (compress 2) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra\n";
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}

# Decides whether $index should be built.
# Each entry of the "dontbuild" list in collect.cfg is used as a regular
# expression that has to match the whole index description. On the first
# match the index is recorded in $self->{'notbuilt'} and 0 is returned;
# otherwise 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        #push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}

# Builds either the single index named by $indexname (when it contains at
# least one word character) or every index listed in the collection
# configuration, then defines the final field lists.
# Indexes rejected by want_built are reported and skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [$indexname];
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        # the matching </Stage> is written by build_index
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    #define the final field lists
    $self->make_final_field_list();

}

# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "fields[:subcollection[:languages]]".
#
# Returns a reference to a hash containing
#   - index description => directory name
#   - 'indexmap', 'subcollectionmap', 'languagemap': the original
#     field / subcollection / language string => its short name
#   - 'indexmaporder', 'subcollectionmaporder', 'languagemaporder': the
#     same strings in the order in which they were first seen
#   - the field, subcollection and language strings themselves => short
#     name (used for the collection metadata later on)
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # short name => original string, kept separately for each of the three
    # components. These must be real hashes of their own so that a short
    # name used for one component cannot overwrite an entry belonging to
    # another component.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);
        # the directory name starts with a processed version of index fields
        #my ($pindex) = $self->process_field($fields);
        #$pindex = lc ($pindex);
        # now we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique; make_unique changes
        # $pindex, $psub or $plang through the references it is given
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been used so later collisions can be detected
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}

# Returns a processed (abbreviated) version of a field.
#  - undefined fields and fields without any word character give ""
#  - if the field has two or more comma separated components, the result
#    is the first character of each of the first two components
#  - otherwise the result is the first character followed by the next
#    consonant; if there is no later consonant, the first two characters
#    are used, and a field of just one character is returned as it is
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        my @initials = ();
        foreach my $component (@components[0, 1]) {
            # work on a copy so the component list itself is left alone
            my $initial = $component;
            $initial =~ s/^(.).*$/$1/;
            push (@initials, $initial);
        }
        return join("", @initials);
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;

    # neither pattern matches a field with only one usable character;
    # fall back to that character instead of returning nothing
    ($first) = $field =~ /^(.)/ unless defined $first;
    $first = "" unless defined $first;
    $second = "" unless defined $second;

    return "$first$second";
}

# Called when the directory name built from the three short names is
# already taken. Moves one of the short names on to its next version and
# returns the new concatenated directory name.
#
# $namehash maps, per component ('index', 'subcollection', 'languages'),
# a short name to the original string it was last used for. $indexref,
# $subref and $langref are references to the current short names; the one
# that is changed is modified in place.
#
# The first component whose short name is recorded for a different
# original string is the one that is changed. If none of them differs the
# index component is changed anyway, so that the caller always gets back
# a name that is different from the one it passed in.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    # string comparison that treats an undefined value as ""
    my $differs = sub {
        my ($known, $wanted) = @_;
        $known = "" unless defined $known;
        $wanted = "" unless defined $wanted;
        return $known ne $wanted;
    };

    if ($differs->($namehash->{'index'}->{$$indexref}, $fields)) {
        $self->get_next_version ($indexref);
    } elsif ($differs->($namehash->{'subcollection'}->{$$subref}, $subcollection)) {
        $self->get_next_version ($subref);
    } elsif ($differs->($namehash->{'languages'}->{$$langref}, $languages)) {
        $self->get_next_version ($langref);
    } else {
        # nothing looks different, but the caller still has a collision
        $self->get_next_version ($indexref);
    }
    return "$$indexref$$subref$$langref";
}

# Moves the name referenced by $nameref on to its next version, in place:
#  - a name ending in two digits has that number incremented
#  - a name ending in one digit has the digit incremented (9 becomes 10)
#  - any other name has its last character replaced by 0
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1;
        $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $num = $1;
        if ($num == 9) {
            # only one digit is present here, so replace just that digit
            $$nameref =~ s/\d$/10/;
        } else {
            $num ++;
            $$nameref =~ s/\d$/$num/;
        }
    } else {
        $$nameref =~ s/.$/0/;
    }
}

# Builds one index.
#
# $index is an index description "fields[:subcollections[:languages]]";
# its directory name is looked up in $self->{'index_mapping'}.
#
# The documents are fed through the plugins twice: into
# "mgpp_passes -I1" to create the index dictionary and, after
# mgpp_perf_hash_build has run, into "mgpp_passes -I2" to invert the
# text. Afterwards the weights file, the on-disk stemmed dictionary and
# the three stem indexes are created. In debug mode no external program
# is run and the document processor writes to STDOUT.
#
# If the first pass does not produce the ".id" dictionary file the index
# is recorded in $self->{'notbuilt'} and the method returns early.
# Dies if a required executable is missing from $GSDLHOME/bin/$GSDLOS, if
# a pipe to mgpp_passes cannot be opened or if the index directory cannot
# be read. When the gli flag is set, XML progress tags (including the
# closing </Stage>) are written to STDERR.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
                                               $indexdir,
                                               $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names and possibly the doc name for mgpasses
    my $mgpp_passes_sections = "";
    my ($doc_level) = $self->{'doc_level'};
    $mgpp_passes_sections .= "-J " . $level_map{$doc_level} ." ";

    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level) {
            $mgpp_passes_sections .= "-K " . $level_map{$level}. " ";
        }
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        # this test relies on the default output handle being stored as
        # the string "STDERR" (see new)
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach $subcollection (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index

    # this puts a separate Language/en entry in for each language in the list
    # is this what we want?
    # should we just have one entry with Language/en,es/ ??
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach $language (@languages) {
        # a leading "!" negates the language
        my $not=0;
        if ($language =~ s/^\!//) {
            $not = 1;
        }
        if ($not) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    # $handle holds the *name* of the output handle: STDOUT when
    # debugging, otherwise this package's PIPEOUT pipe to mgpp_passes
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text(1);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    # NOTE(review): this check is also made in debug mode, where the first
    # pass only writes to STDOUT - confirm that bailing out here is wanted.
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the same named handle as a pipe to the second pass; the
        # document processor was given this handle name above
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n"  if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n"  if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );


        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n"  if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }
        system ("mgpp_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mgpp_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mgpp_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

        # report unwanted files (every file whose suffix is not listed in
        # %wanted_index_files). The actual removal is currently disabled,
        # so the files are only reported, not deleted.
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}

# Creates the collection information database and processes the
# associated files.
#
# Everything is written as text to a pipe into txt2db (or to STDOUT in
# debug mode):
#   1. the [collection] entry: collection metadata, followed by the
#      display names of the index fields, the levels, the subcollections
#      and the languages;
#   2. whatever the document processor writes in 'infodb' mode while the
#      documents are fed through the plugins;
#   3. the classification information;
#   4. the [browselist] entry listing all documents.
#
# Dies if txt2db is missing from $GSDLHOME/bin/$GSDLOS or the pipe cannot
# be opened. When the gli flag is set, XML progress tags are written to
# STDERR.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }
    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    # $handle holds the *name* of the output handle: STDOUT when
    # debugging, otherwise this package's PIPEOUT pipe to txt2db
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});

    $self->{'buildproc'}->reset();

    # do the collection info
    print $handle "[collection]\n";

    # first do the collection meta stuff - everything without a dot
    my $collmetadefined = 0;
    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        $collmetadefined = 1;
        foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
            next if ($cmeta =~ /^\./); # for now, ignore ones with dots
            my ($metadata_entry) = $self->create_language_db_map($cmeta, $cmeta);
            #write the entry to the file
            print $handle $metadata_entry;

        } # foreach collmeta key
    }

    # add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the metadata name
    my $field_entry = "";
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
        my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
        # entries with the value 1 are not long field names
        next if $shortfield eq 1;

        # we need to check if some coll meta has been defined
        my $collmeta = ".$longfield";
        if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            $field_entry .= $self->create_language_db_map($collmeta, $shortfield);
        } else { #use the metadata names, or the text macros for allfields and textonly
            if ($longfield eq "allfields") {
                $field_entry .= "<$shortfield>_query:textallfields_\n";
            } elsif ($longfield eq "text") {
                $field_entry .= "<$shortfield>_query:texttextonly_\n";
            } else {
                $field_entry .= "<$shortfield>$longfield\n";
            }
        }
    }
    print $handle $field_entry;

    # now add the level names
    my $level_entry = "";
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        # the metadata key uses the level name as it is stored in the
        # configuration at this point (new already lower-cases these
        # entries in place)
        my $collmeta = ".$level";
        $level =~ tr/A-Z/a-z/; # make it lower case
        my $levelid = $level_map{$level}; # find the actual value we used in the index
        if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            $level_entry .= $self->create_language_db_map($collmeta, $levelid);
        } else {
            # use the default macro
            $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
        }
    }
    print $handle $level_entry;

    # now add subcoll meta
    my $subcoll_entry = "";
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        # the short name is needed by both branches below
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$subcoll"}) {
            $subcoll_entry .= $self->create_language_db_map(".$subcoll", $shortname);
        } else {
            # no metadata given: fall back to the subcollection name itself
            $subcoll_entry .= "<$shortname>$subcoll\n";
        }
    }
    print $handle $subcoll_entry;

    # now add language meta
    my $lang_entry = "";
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        # the short name is needed by both branches below
        my $shortname = $self->{'index_mapping'}->{$lang};
        if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$lang"}) {
            $lang_entry .= $self->create_language_db_map(".$lang", $shortname);
        } else {
            # no metadata given: fall back to the language name itself
            $lang_entry .= "<$shortname>$lang\n";
        }
    }
    print $handle $lang_entry;

    #end the collection entry
    print $handle "\n" . ('-' x 70) . "\n";


    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'},
                                     $self->{'gli'});

    #output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";
    close ($handle) if !$self->{'debug'};

    print STDERR "</Stage>\n" if $self->{'gli'};
}

# Build the database lines for one piece of collection metadata, with one
# line per language variant.
#
# Parameters:
#   $metaname - key into $self->{'collect_cfg'}->{'collectionmeta'}; the
#               value there is a hash keyed by 'default' or '[l=xx]'
#   $mapname  - tag name to use in the emitted lines
#
# Returns a string made of "<mapname>value\n" (the default, always first)
# followed by "<mapname:xx>value\n" lines for each '[l=xx]' key.
#
# If no default key is present, a default line is synthesised: the English
# value if there is one, otherwise the value of the first key.  Keys are
# visited in sorted order so that both the order of the language lines and
# the choice of fallback default are the same on every run (plain hash
# order is not stable between runs).
sub create_language_db_map {
    my $self = shift (@_);
    my ($metaname, $mapname) = @_;

    my $meta = $self->{'collect_cfg'}->{'collectionmeta'}->{$metaname} || {};

    my $defaultfound = 0;
    my $first = 1;
    my $metadata_entry = "";
    my $default = "";

    # iterate through the languages in a reproducible order
    foreach my $lang (sort keys %{$meta}) {
        if ($first) {
            $first = 0;
            # set the default default to the first entry
            $default = $meta->{$lang};
        }
        if ($lang =~ /default/) {
            $defaultfound = 1;
            # the default entry goes first
            $metadata_entry = "<$mapname>" . $meta->{'default'} . "\n" . $metadata_entry;
        }
        else {
            # keys that are neither a default nor of the form [l=xx] are ignored
            my ($code) = $lang =~ /^\[l=(\w*)\]$/;
            if ($code) {
                $metadata_entry .= "<$mapname:$code>" . $meta->{$lang} . "\n";

                # Use the English value as the default if no default is specified
                if ($code =~ /en/i) {
                    $default = $meta->{$lang};
                }
            }
        }
    }

    # if we haven't found a default, put one in
    if (!$defaultfound) {
        $metadata_entry = "<$mapname>$default\n" . $metadata_entry;
    }
    return $metadata_entry;
}
# Placeholder: takes the builder object off the argument list and does
# nothing else.
sub collect_specific {
    my $self = shift;
}

# At the end of building, we have an indexfieldmap with all the mappings, plus
# some extras, and an indexmap containing any indexes that weren't specified in the index definition.
# We want to make an ordered list of the fields that are indexed, and a list of the mappings that are used. These will be used for the build.cfg file, and for the collection meta definition.
# We store these in the builder's 'build_cfg' entry.
# Work out the final list of indexed fields and their short-name mappings
# and store them in $self->{'build_cfg'} (which is reset to a fresh hash):
#   'indexfields'   - array ref of field names, in order
#   'indexfieldmap' - array ref of "field->SHORTNAME" strings, same order
#
# Reads $self->{'collect_cfg'}->{'indexes'} for the fields named in the
# index definition, and $self->{'buildproc'}->{'indexfields'} /
# {'indexfieldmap'} for the fields actually seen and their short names.
# The pseudo-field "metadata" expands to every seen field that was not
# named explicitly; those are added in sorted order so that the resulting
# lists are the same on every run.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    my $fieldmap = $self->{'buildproc'}->{'indexfieldmap'};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    my @indexfields = ();
    my %specified = ();
    my @specifiedfieldorder = ();

    # Go through the index definition and record each field, so we can
    # easily check whether it was specified explicitly.  When expanding
    # "metadata" we list all the individual fields, and we don't want to
    # add again any that the index definition already names.
    foreach my $indexdef (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # work on a copy, and remove subcoll stuff
        my $parts = $indexdef;
        $parts =~ s/:.*$//;
        foreach my $f (split(',', $parts)) {
            next if defined $specified{$f};
            $specified{$f} = 1;
            push (@specifiedfieldorder, "$f");
        }
    }

    # now build the two lists, in the order the fields were specified
    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # every field seen during the build that wasn't named explicitly
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
                next if defined $specified{$newfield};
                push (@indexfieldmap, $newfield . "->" . $fieldmap->{$newfield});
                push (@indexfields, "$newfield");
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, "text->TX");
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields->ZZ");
            push (@indexfields, "allfields");
        } else {
            push (@indexfieldmap, $field . "->" . $fieldmap->{$field});
            push (@indexfields, "$field");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}


# Recreate the field list from the build.cfg file, looking first in the build directory, then in the index directory. If there is no build.cfg, we can't recreate the field list (there is unlikely to be any index anyway).
# Rebuild $self->{'build_cfg'} ('indexfields' and 'indexfieldmap' array
# refs) from an existing build.cfg, and load the "field->SHORT" mappings
# found there into $self->{'buildproc'}->{'indexfieldmap'}.
#
# The file is looked for in $self->{'build_dir'} first and then in
# $ENV{'GSDLCOLLECTDIR'}/index.  If neither exists the sub returns early,
# leaving $self->{'build_cfg'} as an empty hash.
#
# Entries of 'indexfieldmap' that are not of the form "field->SHORT" are
# still copied into the list, but are not added to the lookup hash
# (previously they produced a mapping from the empty string to undef).
sub read_final_field_list {
    my $self = shift (@_);
    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();

    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if it's there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");

    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");
        if (!-e $buildconfigfile) {
            # we can't find a config file - just ignore the field list
            return;
        }
    }

    my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$mapping");
            # only well-formed "long->short" entries go into the lookup hash
            if (my ($long, $short) = $mapping =~ /^(.*)\-\>(.*)$/) {
                $self->{'buildproc'}->{'indexfieldmap'}->{$long} = $short;
            }
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}
# Collect the build information and write it to build.cfg in
# $self->{'build_dir'}.
#
# Starts from $self->{'build_cfg'} when that is defined (otherwise from an
# empty hash) and adds: build date and type, index levels, text level,
# document and byte counts from the buildproc, and the index,
# subcollection, language and not-built lists.  Progress is reported on
# the outhandle (verbosity >= 1) and as <Stage> markers on STDERR when the
# 'gli' flag is set.
sub make_auxiliary_files {
    my $self = shift (@_);

    # this already includes indexfieldmap and indexfields, if it is set
    my $cfg = defined $self->{'build_cfg'} ? $self->{'build_cfg'} : {};

    my $out = $self->{'outhandle'};
    if ($self->{'verbosity'} >= 1) {
        print {$out} "\n*** creating auxiliary files \n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreatingAuxilary'>\n";
    }

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # build date and type
    $cfg->{'builddate'} = time;
    $cfg->{'buildtype'} = "mgpp"; #do we need this??

    # level info: translate each level through %level_map
    my @levels = ();
    push (@levels, $level_map{$_}) foreach @{$self->{'levelorder'}};
    $cfg->{'indexlevels'} = \@levels;

    # text is stored at section level if sections are in use, else document level
    $cfg->{'textlevel'} = $level_map{ $self->{'levels'}->{'section'} ? 'section' : 'document' };

    # number of documents and number of bytes
    $cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # mapping between the index names and the directory names,
    # leaving out any index recorded as not built
    my $mapping = $self->{'index_mapping'};
    my @indexes = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$index});
        push (@indexes, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    $cfg->{'indexmap'} = \@indexes;

    # subcollection mapping - only stored when there is at least one
    my @subcollections = ();
    push (@subcollections, $_ . "->" . $self->{'index_mapping'}->{'subcollectionmap'}->{$_})
        foreach @{$self->{'index_mapping'}->{'subcollectionmaporder'}};
    if (scalar (@subcollections)) {
        $cfg->{'subcollectionmap'} = \@subcollections;
    }

    # language mapping - only stored when there is at least one
    my @languages = ();
    push (@languages, $_ . "->" . $self->{'index_mapping'}->{'languagemap'}->{$_})
        foreach @{$self->{'index_mapping'}->{'languagemaporder'}};
    if (scalar (@languages)) {
        $cfg->{'languagemap'} = \@languages;
    }

    # names of the indexes that were not built - only stored when non-empty
    my @skipped = ();
    push (@skipped, $_) foreach (keys %{$self->{'notbuilt'}});
    if (scalar (@skipped)) {
        $cfg->{'notbuilt'} = \@skipped;
    }

    # write out the build information; the two patterns name the
    # single-valued and the list-valued keys respectively
    &cfgread::write_cfg_file($self->{'build_dir'} . "/build.cfg", $cfg,
                             '^(builddate|buildtype|numdocs|numbytes|textlevel)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels)$');

    print STDERR "</Stage>\n" if $self->{'gli'};
}

# Placeholder: takes the builder object off the argument list and does
# nothing else.
sub deinit {
    my $self = shift;
}

# Print a short summary of the current pass to $self->{'outhandle'}:
# which index is being created (or that text is being compressed), the
# total bytes in the collection and the bytes processed for this pass.
# When fewer than 50 bytes were processed, and the pass is either an
# indexing pass or a text pass with 'no_text' unset, a warning banner is
# printed as well.  All figures come from $self->{'buildproc'}.
sub print_stats {
    my $self = shift (@_);

    my $out = $self->{'outhandle'};
    my $proc = $self->{'buildproc'};

    my $is_indexing = $proc->get_indexing_text();
    my $name = $proc->get_index();
    my $total = $proc->get_num_bytes();
    my $processed = $proc->get_num_processed_bytes();

    my $heading = $is_indexing
        ? "Stats (Creating index $name)\n"
        : "Stats (Compressing text from $name)\n";
    print {$out} $heading;
    print {$out} "Total bytes in collection: $total\n";
    print {$out} "Total bytes in $name: $processed\n";

    if ($processed < 50 && ($is_indexing || !$self->{'no_text'})) {
        # inside this branch a non-indexing pass always has 'no_text' unset,
        # so the choice of message depends only on the kind of pass
        my $warning = $is_indexing
            ? "WARNING: There is very little or no text to process for $name\n"
            : "WARNING: There is very little or no text to compress\n";
        print {$out} "***************\n";
        print {$out} $warning;
        print {$out} "         Was this your intention?\n";
        print {$out} "***************\n";
    }

}

1;