[1972] | 1 | #!/usr/bin/perl -w
|
---|
| 2 |
|
---|
| 3 | # kea-choose-best-phrase.pl
|
---|
| 4 | # Version 1.1
|
---|
| 5 |
|
---|
| 6 | # Kea -- Automatic Keyphrase Extraction
|
---|
| 7 | # Copyright 1998-1999 by Gordon Paynter and Eibe Frank
|
---|
| 8 | # Contact [email protected] or [email protected]
|
---|
| 9 | #
|
---|
| 10 | # This program is free software; you can redistribute it and/or modify
|
---|
| 11 | # it under the terms of the GNU General Public License as published by
|
---|
| 12 | # the Free Software Foundation; either version 2 of the License, or
|
---|
| 13 | # (at your option) any later version.
|
---|
| 14 | #
|
---|
| 15 | # This program is distributed in the hope that it will be useful,
|
---|
| 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 18 | # GNU General Public License for more details.
|
---|
| 19 | #
|
---|
| 20 | # You should have received a copy of the GNU General Public License
|
---|
| 21 | # along with this program; if not, write to the Free Software
|
---|
| 22 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 23 |
|
---|
| 24 | # Version history
|
---|
| 25 | #
|
---|
| 26 | # 1.0 Witten et.al.
|
---|
| 27 | # 1.1 First Distribution. GPL added.
|
---|
| 28 | # 1.1.1 Don;t use STDERR.
|
---|
| 29 |
|
---|
| 30 | # Usage : kea-choose-best-phrase.pl <filestem>
|
---|
| 31 | # The script assumes filestem.stemmed and filestem.unstemmed exist and
|
---|
| 32 | # chooses the most frequent form of each phrase in filestem.unstemmed.
|
---|
| 33 |
|
---|
| 34 | # If a phrase of length 1 appears in initial capitals all the time,
|
---|
| 35 | # then prefix it with "PN-" to show it is (probably) a proper-noun.
|
---|
| 36 |
|
---|
| 37 | # If a phrase of length 1 appears in initial capitals *most of* the time,
|
---|
| 38 | # and if $mark_common_capitals = 1,
|
---|
| 39 | # then prefix it with "PN-" to show it is (probably) a proper-noun.
|
---|
| 40 | $mark_common_capitals = 0;
|
---|
| 41 |
|
---|
| 42 | # Get command-line arguments
|
---|
| 43 | if (!$ARGV[0]) {
|
---|
| 44 | die "choose-best-phrase.pl <filestem>\n";
|
---|
| 45 | }
|
---|
| 46 | $stemmed = "$ARGV[0].stemmed";
|
---|
| 47 | $unstemmed = "$ARGV[0].unstemmed";
|
---|
| 48 | print "kea-choose-best-keyphrase.pl: $unstemmed\n";
|
---|
| 49 |
|
---|
| 50 | # catalog the unstemmed forms each stemmed phrase is derived from
|
---|
| 51 | open(S, "<$stemmed");
|
---|
| 52 | open(U, "<$unstemmed");
|
---|
| 53 |
|
---|
| 54 | while (<S>) {
|
---|
| 55 | chop($s = $_);
|
---|
| 56 | chop($u = <U>);
|
---|
| 57 | $f{$s}{$u}++;
|
---|
| 58 | }
|
---|
| 59 | close(S);
|
---|
| 60 | close(U);
|
---|
| 61 |
|
---|
| 62 | # move the old ustemmed file out of the way
|
---|
| 63 | `mv $unstemmed $unstemmed.old`;
|
---|
| 64 |
|
---|
| 65 | # make a new unstemmed file
|
---|
| 66 | open(S, "<$stemmed");
|
---|
| 67 | open(U, ">$unstemmed");
|
---|
| 68 |
|
---|
| 69 | while (<S>) {
|
---|
| 70 |
|
---|
| 71 | chop($s = $_);
|
---|
| 72 | @variations = keys(%{$f{$s}});
|
---|
| 73 | # print "$s: ", join( ", ", @variations), "\n";
|
---|
| 74 |
|
---|
| 75 | # print the best variation on the phrase
|
---|
| 76 |
|
---|
| 77 | # if we have seen the phrase before, use the same form
|
---|
| 78 | if (defined($canon{$s})) {
|
---|
| 79 | # print " CANON: $canon{$s}";
|
---|
| 80 |
|
---|
| 81 | # if there is only one form of this phrase, use it
|
---|
| 82 | } elsif ($#variations == 0) {
|
---|
| 83 | # if it is in Initial Caps mark it as a proper-noun
|
---|
| 84 |
|
---|
| 85 | if ($variations[0] =~ /^[A-Z][^A-Z ]+$/) {
|
---|
| 86 | # this is a proper-noun (one word, initial capitals)
|
---|
| 87 | # print " PROPERNOUN ($f{$s}{$variations[0]}): PN-$variations[0]";
|
---|
| 88 | $canon{$s} = "PN-$variations[0]";
|
---|
| 89 | } else {
|
---|
| 90 | # use lowercase
|
---|
| 91 | # print " SOLO ($f{$s}{$variations[0]}): ", $variations[0];
|
---|
| 92 | $canon{$s} = $variations[0];
|
---|
| 93 | }
|
---|
| 94 |
|
---|
| 95 | # make sure we print some form of each phrase
|
---|
| 96 | } else {
|
---|
| 97 | $most = 0;
|
---|
| 98 | $best = ();
|
---|
| 99 | foreach $v (@variations) {
|
---|
| 100 | if ($f{$s}{$v} > $most) {
|
---|
| 101 | $most = $f{$s}{$v};
|
---|
| 102 | $best = $v;
|
---|
| 103 | }
|
---|
| 104 | # print " $v\t$f{$s}{$v}\n";
|
---|
| 105 | }
|
---|
| 106 | # print " FIRST: $variations[0]\n";
|
---|
| 107 | # print " MOST ($most): $best";
|
---|
| 108 |
|
---|
| 109 | # if the phrase is a single word and $mark_common_capitals = 1
|
---|
| 110 | # then we may want to mark this as a proper-noun.
|
---|
| 111 | if ($mark_common_capitals && ($best =~ /^[A-Z][^A-Z ]+$/)) {
|
---|
| 112 | # this is a proper-noun (one word, initial capitals)
|
---|
| 113 | $best = "PN-$best";
|
---|
| 114 | # print "$best\n";
|
---|
| 115 | }
|
---|
| 116 |
|
---|
| 117 | $canon{$s} = $best;
|
---|
| 118 | }
|
---|
| 119 | # print "\n";
|
---|
| 120 | print U "$canon{$s}\n";
|
---|
| 121 | }
|
---|
| 122 | close(U);
|
---|