#!/usr/bin/perl -w

#    kea-choose-best-phrase.pl
#    Version 1.1

#    Kea -- Automatic Keyphrase Extraction
#    Copyright 1998-1999 by Gordon Paynter and Eibe Frank
#    Contact gwp@cs.waikato.ac.nz or eibe@cs.waikato.ac.nz
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

# Version history
#
# 1.0   Witten et.al.
# 1.1   First Distribution.  GPL added.
# 1.1.1 Don;t use STDERR.

# Usage : kea-choose-best-phrase.pl <filestem>
# The script assumes filestem.stemmed and filestem.unstemmed exist and
# chooses the most frequent form of each phrase in filestem.unstemmed.

# If a phrase of length 1 appears in initial capitals all the time,
# then prefix it with "PN-" to show it is (probably) a proper-noun.

# If a phrase of length 1 appears in initial capitals *most of* the time,
# and if $mark_common_capitals = 1,
# then prefix it with "PN-" to show it is (probably) a proper-noun.
$mark_common_capitals = 0;

# Get command-line arguments
if (!$ARGV[0]) { 
    die "choose-best-phrase.pl <filestem>\n";
}
$stemmed = "$ARGV[0].stemmed";
$unstemmed = "$ARGV[0].unstemmed";
print "kea-choose-best-keyphrase.pl: $unstemmed\n";

# catalog the unstemmed forms each stemmed phrase is derived from
open(S, "<$stemmed");
open(U, "<$unstemmed");

while (<S>) {
    chop($s = $_);
    chop($u = <U>);
    $f{$s}{$u}++;
}
close(S);
close(U);

# move the old ustemmed file out of the way
`mv $unstemmed $unstemmed.old`;

# make a new unstemmed file
open(S, "<$stemmed");
open(U, ">$unstemmed");

while (<S>) {

    chop($s = $_);
    @variations = keys(%{$f{$s}});
    # print "$s: ", join( ", ", @variations), "\n";

    # print the best variation on the phrase

    # if we have seen the phrase before, use the same form
    if (defined($canon{$s})) {
	# print "  CANON: $canon{$s}";

    # if there is only one form of this phrase, use it
    } elsif ($#variations == 0) {
	# if it is in Initial Caps mark it as a proper-noun
       
	if ($variations[0] =~ /^[A-Z][^A-Z ]+$/) {
	    # this is a proper-noun (one word, initial capitals)
	    # print "  PROPERNOUN ($f{$s}{$variations[0]}): PN-$variations[0]";
	    $canon{$s} = "PN-$variations[0]";
	} else {
	    # use lowercase
	    # print "  SOLO ($f{$s}{$variations[0]}): ", $variations[0];
	    $canon{$s} = $variations[0];
	}
    
    # make sure we print some form of each phrase
    } else {
	$most = 0;
	$best = (); 
	foreach $v (@variations) {
	    if ($f{$s}{$v} > $most) {
		$most = $f{$s}{$v};
		$best = $v;
	    }
	    # print "    $v\t$f{$s}{$v}\n";
	}
	# print "  FIRST: $variations[0]\n";
	# print "  MOST ($most): $best";

	# if the phrase is a single word and $mark_common_capitals = 1
	# then we may want to mark this as a proper-noun.
	if ($mark_common_capitals && ($best =~ /^[A-Z][^A-Z ]+$/)) {
	    # this is a proper-noun (one word, initial capitals)
	    $best = "PN-$best";
	    # print "$best\n";
	}

	$canon{$s} = $best;
    }
    # print "\n";
    print U "$canon{$s}\n";
}
close(U);