###########################################################################
#
# phind.pm -- the Phind classifier
#
# Copyright (C) 2000 Gordon W. Paynter
# Copyright (C) 2000 New Zealand Digital Library Project
#
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# The phind classifier plugin
#
# options are:
#   title=Title           The title field for this classification
#   text=fields           The text used to build the phrase hierarchy
#   phindexdir=directory  Location of phind index files
#   verbosity=num         Control amount of output
#   untidy=true           Do not clean up intermediate files
#   suffixmode=num        Mode of suffix program (0 = all phrases, 1 = stopword)
#   suffixsize=num        Number of symbols available to suffix program

# How a classifier works.
#
# When a classifier is requested in the collect.cfg file, buildcol creates a
# new classifier object (such as the one defined in this file) and later
# passes each document object to the classifier in turn.  Four functions are
# used:
#
# 1. "new" is called before the documents are processed to set up the
#    classifier.
#
# 2.
# "init" is called after buildcol.pl has created the indexes etc but
#    before the documents are classified in order that the classifier might
#    set any variables it requires, etc.
#
# 3. "classify" is called once for each document object.  The classifier
#    "classifies" each document and updates its local data accordingly.
#
# 4. "get_classify_info" is called after every document has been
#    classified.  It collates the information about the documents and
#    stores a reference to the classifier so that Greenstone can later
#    display it.

package phind;

use BasClas;
use util;

sub BEGIN {
    @ISA = ('BasClas');
}

# Define delimiter symbols used in the clauses file - this should be
# abstracted out someplace
my $colstart = "COLLECTIONSTART";
my $colend   = "COLLECTIONEND";
my $doclimit = "DOCUMENTLIMIT";
my $senlimit = "SENTENCELIMIT";
my @delimiters = ($colstart, $colend, $doclimit, $senlimit);


# Create a new phind browser based on the options in collect.cfg.
#
# Arguments: the class name, the classifier options from collect.cfg,
# with the output filehandle as the last element of @_.
# Returns: a blessed phind object.  Exits if the phind support programs
# are not installed or we are running on Windows.
sub new {
    my ($class, @options) = @_;
    my $self = new BasClas($class, @_);
    my $out = pop @options;

    # Phind installation check
    #
    # The phind phrase browser is research software and is not installed
    # by default.  If the user attempts to use it we warn them that it's
    # a bit dodgy, then tell them how to install it.  If they can do that
    # and get all the files in place, then we let them proceed.

    print $out "The Phind classifier for Greenstone.\n";
    print $out "Checking the phind phrase browser requirements...\n";

    # Make sure we're not in windows
    if ($ENV{'GSDLOS'} =~ /windows/i) {
        print STDERR "Phind currently only works under Unix";
        exit(1);
    }

    # Ensure the Phind generate scripts are in place
    my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
    my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
    if (!(-e $file1)) {
        print STDERR "The phind \"suffix\" program is not installed. ";
        print STDERR "To install it, change to the directory\n";
        print STDERR " $src\n";
        print STDERR "and type \"make install-phind\".\n\n";
        exit(1);
    }

    # Ensure the Phind CGI script is in place
    $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
    $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");
    if (!(-e $file1)) {
        print STDERR "The phind CGI program is not installed. ";
        print STDERR "To install it, change to the directory\n";
        print STDERR " $src\n";
        print STDERR "and type \"make install-phind\".\n\n";
        exit(1);
    }

    # Ensure the Phind Java applet is in place
    $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
    $file1 = &util::filename_cat($src, "Phind.class");
    if (!(-e $file1)) {
        print STDERR "The phind Java classes are not compiled. ";
        print STDERR "To compile them, change to the directory\n";
        print STDERR " $src\n";
        print STDERR "and use your Java compiler to compile Phind.java.\n";
        print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
        exit(1);
    }

    # The installation appears OK - set up the classifier defaults
    my $collection = $ENV{'GSDLCOLLECTION'};
    my $phindexdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "phindex");
    my $language = "english";
    my $title = "Topic";
    my $indexes = "section:Title,section:text";
    my $suffixmode = 1;
    my $suffixsize = 40000000;
    my $verbosity = 2;
    my $untidy = 0;

    # parse the options
    foreach my $option (@options) {
        if ($option =~ /^text=(.*)$/i) {
            $indexes = $1;
        } elsif ($option =~ /^title=(.*)$/i) {
            $title = $1;
        } elsif ($option =~ /^phindexdir=(.*)$/i) {
            $phindexdir = $1;
        } elsif ($option =~ /^suffixsize=(.*)$/i) {
            $suffixsize = $1;
        } elsif ($option =~ /^suffixmode=(.*)$/i) {
            $suffixmode = $1;
        } elsif ($option =~ /^verbosity=(.*)$/i) {
            $verbosity = $1;
        } elsif ($option =~ /^untidy/i) {
            $untidy = 1;
        }
    }

    $self->{'collection'} = $collection;
    $self->{'title'} = $title;
    $self->{'indexes'} = $indexes;
    $self->{'suffixmode'} = $suffixmode;
    $self->{'suffixsize'} = $suffixsize;
    $self->{'verbosity'} = $verbosity;
    $self->{'untidy'} = $untidy;

    # limit languages: turn the comma-separated list into an alternation
    # pattern that classify() matches against each document's Language
    $language =~ s/,/\|/g;
    $self->{'language_exp'} = $language;
    # BUGFIX(review): the original assigned $self->{'delimiter'} = $delimiter,
    # but $delimiter was never declared (the delimiters live in @delimiters),
    # so the field was always undef; nothing in this file reads it, so the
    # assignment has been removed.

    # reset phindex directory
    if (-e "$phindexdir") {
        &util::rm_r("$phindexdir");
    }
    &util::mk_dir("$phindexdir");
    $self->{'phindexdir'} = $phindexdir;

    return bless $self, $class;
}


# Initialise the phind classifier: open the filehandles that classify()
# writes to - 'txthandle' accumulates clause text, 'dochandle' the
# per-document details.
sub init {
    my $self = shift (@_);

    # open filehandles for documents and text
    my $phindexdir = $self->{'phindexdir'};

    my $clausefile = &util::filename_cat("$phindexdir", "clauses");
    &util::rm($clausefile) if (-e $clausefile);
    # 3-arg open with a lexical handle; classify() prints through the
    # stored scalar, so this is interchangeable with the old bareword
    open(my $txthandle, '>', $clausefile) || die "Cannot open $clausefile: $!";
    $self->{'txthandle'} = $txthandle;

    my $docfile = &util::filename_cat("$phindexdir", "docs.txt");
    &util::rm($docfile) if (-e $docfile);
    open(my $dochandle, '>', $docfile) || die "Cannot open $docfile: $!";
    $self->{'dochandle'} = $dochandle;
}
# Classify each document.
#
# Each document object is passed here in turn.  The classifier extracts
# the text of each and stores it in the clauses file; document details
# are stored in the docs.txt file.
sub classify {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $verbosity = $self->{'verbosity'};
    my $top_section = $doc_obj->get_top_section();

    my $title = $doc_obj->get_metadata_element($top_section, "Title");
    print "process: $title\n" if ($verbosity > 2);

    # only consider documents in the classifier's language(s)
    my $doclanguage = $doc_obj->get_metadata_element($top_section, "Language");
    my $phrlanguage = $self->{'language_exp'};
    return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));

    # record this file
    # BUGFIX(review): the original read "my $total++" (a compile error) and
    # tested $self->{'$verbosity'} - a literal-'$' key that is never set -
    # so the count was broken and the message never printed.  $file was
    # also undefined; report the document title instead.
    $self->{'num_documents'}++;
    print "file $self->{'num_documents'}: $title\n" if ($verbosity);

    # Store document details
    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;
    my $dochandle = $self->{'dochandle'};
    # NOTE(review): the field before the first tab appears to have been
    # lost from this copy of the file -- confirm the docs.txt record
    # format against the phindcgi reader.
    print $dochandle "\t$OID\t$title\n";

    # Store the text occurring in this object

    # output the document delimiter
    my $txthandle = $self->{'txthandle'};
    print $txthandle "$doclimit\n";

    # iterate over the required indexes and store their text
    my $indexes = $self->{'indexes'};
    my $text = "";
    my ($part, $level, $field, $section, $data);

    foreach $part (split(/,/, $indexes)) {

        # Each field has a level and a data element (e.g. document:Title)
        ($level, $field) = split(/:/, $part);
        die unless ($level && $field);

        # Extract the text from every section
        # (In phind, document:text and section:text are equivalent)
        if ($field eq "text") {
            $data = "";
            $section = $doc_obj->get_top_section();
            while (defined($section)) {
                $data .= $doc_obj->get_text($section) . "\n";
                $section = $doc_obj->get_next_section($section);
            }
            $text .= convert_gml_to_tokens($data) . "\n";
        }

        # Extract a metadata field from a document
        elsif ($level eq "document") {
            $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
            $text .= convert_gml_to_tokens($data) . "\n";
        }

        # Extract metadata from every section in a document
        elsif ($level eq "section") {
            $data = "";
            $section = $doc_obj->get_top_section();
            while (defined($section)) {
                $data .= $doc_obj->get_metadata_element($section, $field) . "\n";
                $section = $doc_obj->get_next_section($section);
            }
            $text .= convert_gml_to_tokens($data) . "\n";
        }

        # Some sort of specification which I don't understand
        else {
            die "Unknown level ($level) in phind index ($part)\n";
        }
    }

    # output the text, squashing repeated newlines
    $text =~ tr/\n//s;
    print $txthandle "$text";
}


# Construct the classifier from the information already gathered.
#
# When get_classify_info is called, the clauses and docs.txt files have
# already been constructed in the phindex directory.  This function
# translates them into compressed, indexed MGPP files that can be read
# by the phindcgi script.  It also registers our classifier so that it
# shows up in the navigation bar.
sub get_classify_info {
    my $self = shift (@_);

    my $verbosity = $self->{'verbosity'};
    my $phindexdir = $self->{'phindexdir'};
    # BUGFIX(review): $untidy was used in the cleanup step below but never
    # fetched from $self, so it was always undef and the "untidy" option
    # was silently ignored (intermediate files were always removed).
    my $untidy = $self->{'untidy'};
    my $language = "english";

    if ($verbosity) {
        print STDERR "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
    }

    # Construct phind indexes
    my $suffixmode = $self->{'suffixmode'};
    my $suffixsize = $self->{'suffixsize'};
    my ($command, $status);

    # Generate the vocabulary, symbol statistics, and numbers file
    # from the clauses file
    print "\nExtracting vocabulary and statistics\n" if $verbosity;
    &extract_vocabulary($phindexdir, $language, $verbosity);

    # Use the suffix program to generate the phindex/phrases file
    print "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
    &execute("suffix $phindexdir $suffixsize $suffixmode", $verbosity);

    # Create the phrase file and put phrase numbers in phindex/phrases
    print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
    &renumber_phrases("$phindexdir", $verbosity);

    # Create the mg phrase database
    my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
    my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
    my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
    my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
    my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
    my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
    my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");

    print "\nCreating phrase databases\n";
    my $mg_input = &util::filename_cat($phindexdir, "pdata.txt");
    my $mg_stem = "pdata";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);

    # create the mg index of words
    print "\nCreating word-level search indexes\n";
    $mg_input = &util::filename_cat($phindexdir, "pword.txt");
    $mg_stem = "pword";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_perf_hash_build -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
    &execute("$mg_weights_build -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_invf_dict -d $phindexdir -f $mg_stem", $verbosity);

    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 1", $verbosity);
    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 2", $verbosity);
    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 3", $verbosity);

    # create the mg document information database
    print "\nCreating document information databases\n";
    $mg_input = &util::filename_cat($phindexdir, "docs.txt");
    $mg_stem = "docs";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);

    # Tidy up stray files unless the user asked to keep them
    if (!$untidy) {
        print "\nCleaning up\n" if ($verbosity > 2);
        &util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
                  "$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
                  "$phindexdir/phrases", "$phindexdir/docs.txt",
                  "$phindexdir/pdata.txt", "$phindexdir/pword.txt");
        my $outfile = 1;
        while (-e "$phindexdir/outPhrase.$outfile") {
            &util::rm("$phindexdir/outPhrase.$outfile");
            $outfile++;
        }
    }

    # Insert the classifier into.... what?
    my $collection = $self->{'collection'};
    my $url = "library?a=p&p=phind&c=$collection";
    my %classifyinfo = ('thistype'  => 'Invisible',
                        'childtype' => 'Phind',
                        'Title'     => $self->{'title'},
                        'contains'  => []);
    push (@{$classifyinfo{'contains'}}, {'OID' => $url});
    return \%classifyinfo;
}


# Convert GML (or HTML-ish) document text into phind "clause format":
# strip markup, normalise punctuation to clause breaks, and return the
# text with one clause per line.
sub convert_gml_to_tokens {
    $_ = shift @_;

    # FIRST, remove GML tags

    # Replace all whitespace with a simple space
    s/\s+/ /gs;

    # Map paragraph and line-break tags to placeholders, then remove
    # everything else that is in a tag.
    # NOTE(review): the literal tag and entity patterns in this sub were
    # corrupted in this copy of the file and have been reconstructed --
    # confirm against the original phind.pm.
    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    s/\s*<br>\s*/ LINEBREAK /isg;
    s/<[^>]*>/ /sg;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc.  Decode these, then remove the tags
    # they reveal.
    s/&lt;/</sg;
    s/&gt;/>/sg;
    s/\s+/ /sg;
    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    s/\s*<br>\s*/ LINEBREAK /isg;
    s/<[^>]*>/ /sg;

    # remove &amp; and other miscellaneous markup tags
    s/&amp;/&/sg;
    s/&lt;/</sg;
    s/&gt;/>/sg;
    s/&amp;/&/sg;

    # replace <p> and <br> placeholders with carriage returns
    s/PARAGRAPHBREAK/\n/sg;
    s/LINEBREAK/\n/sg;

    # Exceptional punctuation
    #
    # We make special cases of some punctuation

    # remove any apostrophe that indicates omitted letters
    s/(\w+)\'(\w*\s)/ $1$2 /g;

    # remove period that appears in a person's initials
    s/\s([A-Z])\./ $1 /g;

    # replace hyphens in hyphenated words and names with a space
    s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;

    # Convert the remaining text to "clause format":
    # remove all excess punctuation and garbage text, normalise valid
    # punctuation to fullstops and commas, then put one clause per line.

    # Insert newline when the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    s/\s*[\.\?\!]\s+/\n/g;

    # split numbers after four digits
    s/(\d\d\d\d)/$1 /g;

    # split words after 32 characters

    # squash repeated punctuation
    tr/A-Za-z0-9 //cs;

    # save email addresses
    # s/\w+@\w+\.[\w\.]+/EMAIL/g;

    # normalise clause breaks (mostly punctuation symbols) to commas
    s/[^A-Za-z0-9 \n]+/ , /g;

    # Remove repeated commas, and replace with newline
    s/\s*,[, ]+/\n/g;

    # remove extra whitespace
    s/ +/ /sg;
    s/^\s+//mg;
    s/\s*$/\n/mg;

    # remove lines that contain one word or less
    s/^\w*$//mg;
    s/^\s*$//mg;
    tr/\n//s;

    return $_;
}


# Execute a system command, echoing it at high verbosity and aborting
# the build if it fails.  (Note: exit() is given the raw $? status, as
# in the original.)
sub execute {
    my ($command, $verbosity) = @_;
    print "Executing: $command\n" if ($verbosity > 2);
    my $status = system($command);
    if ($status != 0) {
        print STDERR "phindgen.pl - Error executing $command: $!\n";
        exit($status);
    }
}
# Generate the vocabulary, symbol statistics, and numbers file from the
# clauses file.  This is legacy code, so is a bit messy and probably
# won't run under windows.
sub extract_vocabulary {
    my ($phindex_dir, $language, $verbosity) = @_;

    my ($w, $l, $line, $word);

    my ($first_delimiter, $last_delimiter,
        $first_stopword, $last_stopword,
        $first_extractword, $last_extractword,
        $first_contentword, $last_contentword,
        $phrasedelimiter);

    my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);

    my %symbol;
    my (%freq);

    print "Calculating vocabulary\n" if ($verbosity > 1);

    # Read and store the stopwords (unix-specific: find | xargs)
    my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`;
    my %stopwords;
    foreach my $w (split(/\s+/, $words)) {
        $l = lc($w);
        $stopwords{$l} = $w;
    }

    # Read and store the thesaurus terms
    # NOTE(review): the readline targets of the while loops in this sub
    # were lost in this copy of the file ("while()"); they have been
    # reconstructed as reading the handle just opened -- confirm against
    # the original phind.pm.
    $use_thesaurus = 0;
    my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
                                       "$language", "agrovoc.lex");
    if (-e "$lex_file") {
        open(TH, "<$lex_file") || die "Cannot open $lex_file: $!";
        while (<TH>) {
            s/^\d+ //;
            s/\(.*\)//;
            foreach my $w (split(/\s+/, $_)) {
                $thesaurus{lc($w)} = $w;
            }
        }
        close TH;
        $use_thesaurus = 1;
    }

    # Read words in the text and count occurrences
    open(TXT, "<$phindex_dir/clauses") || die "Cannot open $phindex_dir/clauses: $!";
    my @words;
    while (<TXT>) {
        $line = $_;
        next unless ($line =~ /./);

        @words = split(/\s+/, $line);
        foreach $w (@words) {
            $l = lc($w);
            $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
            $freq{$w}++;
        }
        $freq{$senlimit}++;
    }
    close TXT;

    # Calculate the "best" (most frequent) form of each word
    my (%bestform, %totalfreq, %bestfreq);
    foreach $w (sort (keys %freq)) {
        $l = lc($w);

        # totalfreq is the number of times a term appears in any form
        $totalfreq{$l} += $freq{$w};

        if (defined $stopwords{$l}) {
            $bestform{$l} = $stopwords{$l};
        } elsif (defined $thesaurus{$l}) {
            $bestform{$l} = $thesaurus{$l};
        } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
            $bestfreq{$l} = $freq{$w};
            $bestform{$l} = $w;
        }
    }
    undef %freq;
    undef %bestfreq;

    # Assign symbol numbers to tokens
    my $nextsymbol = 1;
    my (@vocab);

    # Delimiters
    # BUGFIX(review): the original aliased the loop variable to the
    # file-scoped @delimiters and lowercased its elements in place;
    # iterate over a copy of each value instead.
    $first_delimiter = 1;
    foreach my $d (@delimiters) {
        $word = lc($d);
        $bestform{$word} = uc($word);
        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_delimiter = $nextsymbol - 1;

    # Stopwords
    $first_stopword = $nextsymbol;
    foreach my $word (sort keys %stopwords) {
        # don't include stopword unless it occurs in the text
        $word = lc($word);
        next unless ($totalfreq{$word});
        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_stopword = $nextsymbol - 1;
    $first_contentword = $nextsymbol;

    # Thesaurus terms
    if ($use_thesaurus) {
        $first_thesaurusword = $nextsymbol;
        foreach my $word (sort keys %thesaurus) {
            $word = lc($word);
            next if ($symbol{$word});

            $bestform{$word} = $thesaurus{$word};
            $vocab[$nextsymbol] = $word;
            $symbol{$word} = $nextsymbol;
            $nextsymbol++;
        }
        $last_thesaurusword = $nextsymbol - 1;
    }

    # Other content words
    $first_extractword = $nextsymbol;
    foreach my $word (sort (keys %bestform)) {
        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_extractword = $nextsymbol - 1;
    $last_contentword = $nextsymbol - 1;

    # Output the words
    print "Saving vocabulary in $phindex_dir/clauses.vocab\n" if ($verbosity > 1);
    open(VOC, ">$phindex_dir/clauses.vocab")
        || die "Cannot open $phindex_dir/clauses.vocab: $!";
    for (my $i = 1; $i < $nextsymbol; $i++) {
        $w = $vocab[$i];
        print VOC "$bestform{$w}\n";
        $totalfreq{$w} = 0 unless ($totalfreq{$w});
    }
    close VOC;

    # Output statistics about the vocabulary
    print "Saving statistics in $phindex_dir/clauses.stats\n" if ($verbosity > 1);
    &util::rm("$phindex_dir/clauses.stats") if (-e "$phindex_dir/clauses.stats");
    open(STAT, ">$phindex_dir/clauses.stats")
        || die "Cannot open $phindex_dir/clauses.stats: $!";
    print STAT "first_delimiter $first_delimiter\n";
    print STAT "last_delimiter $last_delimiter\n";
    print STAT "first_stopword $first_stopword\n";
    print STAT "last_stopword $last_stopword\n";
    if ($use_thesaurus) {
        print STAT "first_thesaurusword $first_thesaurusword\n";
        print STAT "last_thesaurusword $last_thesaurusword\n";
    }
    print STAT "first_extractword $first_extractword\n";
    print STAT "last_extractword $last_extractword\n";
    print STAT "first_contentword $first_contentword\n";
    print STAT "last_contentword $last_contentword\n";
    print STAT "first_symbol $first_delimiter\n";
    print STAT "last_symbol $last_contentword\n";
    print STAT "first_word $first_stopword\n";
    print STAT "last_word $last_contentword\n";
    close STAT;

    undef @vocab;

    # Save text as symbol numbers
    print "Saving text as numbers in $phindex_dir/clauses.numbers\n" if ($verbosity > 1);
    open(TXT, "<$phindex_dir/clauses") || die "Cannot open $phindex_dir/clauses: $!";
    open(NUM, ">$phindex_dir/clauses.numbers")
        || die "Cannot open $phindex_dir/clauses.numbers: $!";
    $phrasedelimiter = $symbol{lc($senlimit)};

    # set up the special symbols that delimit documents and sentences
    print NUM "$symbol{lc($colstart)}\n";

    while (<TXT>) {
        # split sentence into a list of tokens
        $line = $_;
        next unless ($line =~ /./);
        @words = split(/\s+/, $line);

        # output one token at a time
        foreach $word (@words) {
            $word = lc($word);
            print NUM "$symbol{$word}\n";
        }

        # output phrase delimiter
        print NUM "$phrasedelimiter\n";
    }
    print NUM "$symbol{lc($colend)}\n";
    close TXT;
    close NUM;
}


# Prepare the phrases file to be input to mgpp.
# This means renumbering the phrases in order of decreasing frequency.
# This is legacy code, and a little ugly, and may be unix-specific
# (particularly the sort command).
sub renumber_phrases {
    my ($phindex_dir, $verbosity) = @_;

    # Sort the phrases into order of increasing frequency
    # This means the expansions will be sorted correctly later on.
    # NOTE(review): "+2" is the obsolete sort key syntax (third ':'
    # field, reverse numeric); modern equivalent is "-k3,3rn -t:".
    print "Sorting phrases into freq order\n" if ($verbosity);
    system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases");

    my @symbol;

    # Read the vocabulary
    # NOTE(review): the readline targets of the while loops in this sub
    # were lost in this copy of the file ("while()"); they have been
    # reconstructed as reading the handle just opened -- confirm against
    # the original phind.pm.
    print "Reading the vocabulary\n" if ($verbosity);
    open(V, "<$phindex_dir/clauses.vocab")
        || die "Cannot open $phindex_dir/clauses.vocab: $!";
    my $i = 1;
    while (<V>) {
        chomp;
        $symbol[$i++] = $_;
    }
    close V;

    # Create file for phrase data
    #
    # The phrases file looks something like this
    #     159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
    #     159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
    #     159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
    #     159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361

    # The first field on each line is a unique phrase identifier.
    # We need to calculate phrase numbers for each phrase
    print "Calculate phrase numbers\n" if ($verbosity);
    my %phrasenumber;
    my $nextphrase = 1;
    my ($line);
    open(IN, "<$phindex_dir/phrases") || die "Cannot open $phindex_dir/phrases: $!";
    while (<IN>) {
        # read the line
        chomp;
        $line = $_;

        # we're only interested in the first field
        $line =~ s/:.*//;

        # get a phrase number for this line
        $phrasenumber{$line} = $nextphrase;
        $nextphrase++;
    }
    close IN;

    # Now we create a new phrase file using phrase numbers, not the old IDs.
    print "Format phrase data for MGPP\n" if ($verbosity);
    open(IN, "<$phindex_dir/phrases") || die "Cannot open $phindex_dir/phrases: $!";
    open(DATA, ">$phindex_dir/pdata.txt") || die "Cannot open $phindex_dir/pdata.txt: $!";
    open(IDX, ">$phindex_dir/pword.txt") || die "Cannot open $phindex_dir/pword.txt: $!";

    my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
    my @fields;
    my @documents;
    my (@newexp, $k, $n);
    my $linenumber = 0;

    while (<IN>) {
        # read the line
        chomp;
        $line = $_;
        @fields = split(/:/, $line);

        # get a phrase number for this line
        $key = shift @fields;
        die unless (defined($phrasenumber{$key}));
        $num = $phrasenumber{$key};

        # get the text of the phrase, mapping symbol numbers (sNNN)
        # back to their vocabulary entries
        $text = shift @fields;
        $text =~ s/s(\d+)/$symbol[$1]/g;
        # single-word phrases also feed the word-level index
        if ($text =~ / /) {
            $word = "";
        } else {
            $word = $text;
        }

        $linenumber++;
        if ($linenumber % 1000 == 0) {
            print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 2);
        }
        print "$num: $key\t($text)\n" if ($verbosity > 3);

        # get the phrase frequency
        $tf = shift @fields;

        # get the number of expansions
        $countexp = shift @fields;

        # get the expansions and convert them into phrase numbers
        $expansions = shift @fields;
        @newexp = ();
        foreach $k (split(/,/, $expansions)) {
            die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
            $n = $phrasenumber{$k};
            push @newexp, $n;
        }
        @newexp = sort numerically @newexp;

        # get the number of documents
        $countdocs = shift @fields;

        # get the documents, sorted by within-document frequency
        $documents = shift @fields;
        $documents =~ s/d//g;
        @documents = split(/;/, $documents);
        @documents = sort by_frequency @documents;

        # output the phrase data
        print DATA "$num:$text:$tf:$countexp:$countdocs:";
        print DATA join(",", @newexp), ":", join(";", @documents), "\n";

        # output the word index search data
        print IDX "$word\n";
    }
    close IN;
    close DATA;
    close IDX;
}


# sort routines used to renumber phrases

# ascending numeric comparison
sub numerically { $a <=> $b }

# Compare document entries of the form "docnum" or "docnum,freq",
# ordering by decreasing frequency; an entry with no ",freq" part
# counts as frequency 1.
sub by_frequency {
    my $fa = 1;
    if ($a =~ /,/) {
        $fa = $a;
        $fa =~ s/\d+,//;
    }
    my $fb = 1;
    if ($b =~ /,/) {
        $fb = $b;
        $fb =~ s/\d+,//;
    }
    return ($fb <=> $fa);
}

1;