#!/usr/bin/perl -w # kea-choose-best-phrase.pl # Version 1.1 # Kea -- Automatic Keyphrase Extraction # Copyright 1998-1999 by Gordon Paynter and Eibe Frank # Contact gwp@cs.waikato.ac.nz or eibe@cs.waikato.ac.nz # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # Version history # # 1.0 Witten et.al. # 1.1 First Distribution. GPL added. # 1.1.1 Don;t use STDERR. # Usage : kea-choose-best-phrase.pl # The script assumes filestem.stemmed and filestem.unstemmed exist and # chooses the most frequent form of each phrase in filestem.unstemmed. # If a phrase of length 1 appears in initial capitals all the time, # then prefix it with "PN-" to show it is (probably) a proper-noun. # If a phrase of length 1 appears in initial capitals *most of* the time, # and if $mark_common_capitals = 1, # then prefix it with "PN-" to show it is (probably) a proper-noun. $mark_common_capitals = 0; # Get command-line arguments if (!$ARGV[0]) { die "choose-best-phrase.pl \n"; } $stemmed = "$ARGV[0].stemmed"; $unstemmed = "$ARGV[0].unstemmed"; print "kea-choose-best-keyphrase.pl: $unstemmed\n"; # catalog the unstemmed forms each stemmed phrase is derived from open(S, "<$stemmed"); open(U, "<$unstemmed"); while () { chop($s = $_); chop($u = ); $f{$s}{$u}++; } close(S); close(U); # move the old ustemmed file out of the way `mv $unstemmed $unstemmed.old`; # make a new unstemmed file open(S, "<$stemmed"); open(U, ">$unstemmed"); while () { chop($s = $_); @variations = keys(%{$f{$s}}); # print "$s: ", join( ", ", @variations), "\n"; # print the best variation on the phrase # if we have seen the phrase before, use the same form if (defined($canon{$s})) { # print " CANON: $canon{$s}"; # if there is only one form of this phrase, use it } elsif ($#variations == 0) { # if it is in Initial Caps mark it as a proper-noun if ($variations[0] =~ /^[A-Z][^A-Z ]+$/) { # this is a proper-noun (one word, initial capitals) # print " PROPERNOUN ($f{$s}{$variations[0]}): PN-$variations[0]"; $canon{$s} = "PN-$variations[0]"; } else { # use lowercase # print " SOLO ($f{$s}{$variations[0]}): ", $variations[0]; $canon{$s} = $variations[0]; } # make sure we print some form of each phrase } else { $most = 0; $best = (); foreach $v (@variations) { if ($f{$s}{$v} > $most) { $most = $f{$s}{$v}; $best = $v; } # print " $v\t$f{$s}{$v}\n"; } # print " FIRST: $variations[0]\n"; # print " MOST ($most): $best"; # if the phrase is a single word and $mark_common_capitals = 1 # then we may want to mark this as a proper-noun. if ($mark_common_capitals && ($best =~ /^[A-Z][^A-Z ]+$/)) { # this is a proper-noun (one word, initial capitals) $best = "PN-$best"; # print "$best\n"; } $canon{$s} = $best; } # print "\n"; print U "$canon{$s}\n"; } close(U);