1 | #!/usr/bin/perl -w
|
---|
2 |
|
---|
3 | # kea-choose-best-phrase.pl
|
---|
4 | # Version 1.1
|
---|
5 |
|
---|
6 | # Kea -- Automatic Keyphrase Extraction
|
---|
7 | # Copyright 1998-1999 by Gordon Paynter and Eibe Frank
|
---|
8 | # Contact [email protected] or [email protected]
|
---|
9 | #
|
---|
10 | # This program is free software; you can redistribute it and/or modify
|
---|
11 | # it under the terms of the GNU General Public License as published by
|
---|
12 | # the Free Software Foundation; either version 2 of the License, or
|
---|
13 | # (at your option) any later version.
|
---|
14 | #
|
---|
15 | # This program is distributed in the hope that it will be useful,
|
---|
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
18 | # GNU General Public License for more details.
|
---|
19 | #
|
---|
20 | # You should have received a copy of the GNU General Public License
|
---|
21 | # along with this program; if not, write to the Free Software
|
---|
22 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
23 |
|
---|
24 | # Version history
|
---|
25 | #
|
---|
26 | # 1.0 Witten et.al.
|
---|
27 | # 1.1 First Distribution. GPL added.
|
---|
28 | # 1.1.1 Don;t use STDERR.
|
---|
29 |
|
---|
30 | # Usage : kea-choose-best-phrase.pl <filestem>
|
---|
31 | # The script assumes filestem.stemmed and filestem.unstemmed exist and
|
---|
32 | # chooses the most frequent form of each phrase in filestem.unstemmed.
|
---|
33 |
|
---|
34 | # If a phrase of length 1 appears in initial capitals all the time,
|
---|
35 | # then prefix it with "PN-" to show it is (probably) a proper-noun.
|
---|
36 |
|
---|
37 | # If a phrase of length 1 appears in initial capitals *most of* the time,
|
---|
38 | # and if $mark_common_capitals = 1,
|
---|
39 | # then prefix it with "PN-" to show it is (probably) a proper-noun.
|
---|
40 | $mark_common_capitals = 0;
|
---|
41 |
|
---|
42 | # Get command-line arguments
|
---|
43 | if (!$ARGV[0]) {
|
---|
44 | die "choose-best-phrase.pl <filestem>\n";
|
---|
45 | }
|
---|
46 | $stemmed = "$ARGV[0].stemmed";
|
---|
47 | $unstemmed = "$ARGV[0].unstemmed";
|
---|
48 | print "kea-choose-best-keyphrase.pl: $unstemmed\n";
|
---|
49 |
|
---|
50 | # catalog the unstemmed forms each stemmed phrase is derived from
|
---|
51 | open(S, "<$stemmed");
|
---|
52 | open(U, "<$unstemmed");
|
---|
53 |
|
---|
54 | while (<S>) {
|
---|
55 | chop($s = $_);
|
---|
56 | chop($u = <U>);
|
---|
57 | $f{$s}{$u}++;
|
---|
58 | }
|
---|
59 | close(S);
|
---|
60 | close(U);
|
---|
61 |
|
---|
62 | # move the old ustemmed file out of the way
|
---|
63 | `mv $unstemmed $unstemmed.old`;
|
---|
64 |
|
---|
65 | # make a new unstemmed file
|
---|
66 | open(S, "<$stemmed");
|
---|
67 | open(U, ">$unstemmed");
|
---|
68 |
|
---|
69 | while (<S>) {
|
---|
70 |
|
---|
71 | chop($s = $_);
|
---|
72 | @variations = keys(%{$f{$s}});
|
---|
73 | # print "$s: ", join( ", ", @variations), "\n";
|
---|
74 |
|
---|
75 | # print the best variation on the phrase
|
---|
76 |
|
---|
77 | # if we have seen the phrase before, use the same form
|
---|
78 | if (defined($canon{$s})) {
|
---|
79 | # print " CANON: $canon{$s}";
|
---|
80 |
|
---|
81 | # if there is only one form of this phrase, use it
|
---|
82 | } elsif ($#variations == 0) {
|
---|
83 | # if it is in Initial Caps mark it as a proper-noun
|
---|
84 |
|
---|
85 | if ($variations[0] =~ /^[A-Z][^A-Z ]+$/) {
|
---|
86 | # this is a proper-noun (one word, initial capitals)
|
---|
87 | # print " PROPERNOUN ($f{$s}{$variations[0]}): PN-$variations[0]";
|
---|
88 | $canon{$s} = "PN-$variations[0]";
|
---|
89 | } else {
|
---|
90 | # use lowercase
|
---|
91 | # print " SOLO ($f{$s}{$variations[0]}): ", $variations[0];
|
---|
92 | $canon{$s} = $variations[0];
|
---|
93 | }
|
---|
94 |
|
---|
95 | # make sure we print some form of each phrase
|
---|
96 | } else {
|
---|
97 | $most = 0;
|
---|
98 | $best = ();
|
---|
99 | foreach $v (@variations) {
|
---|
100 | if ($f{$s}{$v} > $most) {
|
---|
101 | $most = $f{$s}{$v};
|
---|
102 | $best = $v;
|
---|
103 | }
|
---|
104 | # print " $v\t$f{$s}{$v}\n";
|
---|
105 | }
|
---|
106 | # print " FIRST: $variations[0]\n";
|
---|
107 | # print " MOST ($most): $best";
|
---|
108 |
|
---|
109 | # if the phrase is a single word and $mark_common_capitals = 1
|
---|
110 | # then we may want to mark this as a proper-noun.
|
---|
111 | if ($mark_common_capitals && ($best =~ /^[A-Z][^A-Z ]+$/)) {
|
---|
112 | # this is a proper-noun (one word, initial capitals)
|
---|
113 | $best = "PN-$best";
|
---|
114 | # print "$best\n";
|
---|
115 | }
|
---|
116 |
|
---|
117 | $canon{$s} = $best;
|
---|
118 | }
|
---|
119 | # print "\n";
|
---|
120 | print U "$canon{$s}\n";
|
---|
121 | }
|
---|
122 | close(U);
|
---|