source: trunk/gsdl/perllib/Kea-1.1.4/kea-choose-best-phrase.pl@ 3161

Last change on this file since 3161 was 1972, checked in by jmt14, 23 years ago

* empty log message *

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 3.5 KB
Line 
1#!/usr/bin/perl -w
2
3# kea-choose-best-phrase.pl
4# Version 1.1
5
6# Kea -- Automatic Keyphrase Extraction
7# Copyright 1998-1999 by Gordon Paynter and Eibe Frank
8# Contact [email protected] or [email protected]
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23
24# Version history
25#
26# 1.0 Witten et.al.
27# 1.1 First Distribution. GPL added.
28# 1.1.1 Don;t use STDERR.
29
30# Usage : kea-choose-best-phrase.pl <filestem>
31# The script assumes filestem.stemmed and filestem.unstemmed exist and
32# chooses the most frequent form of each phrase in filestem.unstemmed.
33
34# If a phrase of length 1 appears in initial capitals all the time,
35# then prefix it with "PN-" to show it is (probably) a proper-noun.
36
37# If a phrase of length 1 appears in initial capitals *most of* the time,
38# and if $mark_common_capitals = 1,
39# then prefix it with "PN-" to show it is (probably) a proper-noun.
40$mark_common_capitals = 0;
41
42# Get command-line arguments
43if (!$ARGV[0]) {
44 die "choose-best-phrase.pl <filestem>\n";
45}
46$stemmed = "$ARGV[0].stemmed";
47$unstemmed = "$ARGV[0].unstemmed";
48print "kea-choose-best-keyphrase.pl: $unstemmed\n";
49
50# catalog the unstemmed forms each stemmed phrase is derived from
51open(S, "<$stemmed");
52open(U, "<$unstemmed");
53
54while (<S>) {
55 chop($s = $_);
56 chop($u = <U>);
57 $f{$s}{$u}++;
58}
59close(S);
60close(U);
61
62# move the old ustemmed file out of the way
63`mv $unstemmed $unstemmed.old`;
64
65# make a new unstemmed file
66open(S, "<$stemmed");
67open(U, ">$unstemmed");
68
69while (<S>) {
70
71 chop($s = $_);
72 @variations = keys(%{$f{$s}});
73 # print "$s: ", join( ", ", @variations), "\n";
74
75 # print the best variation on the phrase
76
77 # if we have seen the phrase before, use the same form
78 if (defined($canon{$s})) {
79 # print " CANON: $canon{$s}";
80
81 # if there is only one form of this phrase, use it
82 } elsif ($#variations == 0) {
83 # if it is in Initial Caps mark it as a proper-noun
84
85 if ($variations[0] =~ /^[A-Z][^A-Z ]+$/) {
86 # this is a proper-noun (one word, initial capitals)
87 # print " PROPERNOUN ($f{$s}{$variations[0]}): PN-$variations[0]";
88 $canon{$s} = "PN-$variations[0]";
89 } else {
90 # use lowercase
91 # print " SOLO ($f{$s}{$variations[0]}): ", $variations[0];
92 $canon{$s} = $variations[0];
93 }
94
95 # make sure we print some form of each phrase
96 } else {
97 $most = 0;
98 $best = ();
99 foreach $v (@variations) {
100 if ($f{$s}{$v} > $most) {
101 $most = $f{$s}{$v};
102 $best = $v;
103 }
104 # print " $v\t$f{$s}{$v}\n";
105 }
106 # print " FIRST: $variations[0]\n";
107 # print " MOST ($most): $best";
108
109 # if the phrase is a single word and $mark_common_capitals = 1
110 # then we may want to mark this as a proper-noun.
111 if ($mark_common_capitals && ($best =~ /^[A-Z][^A-Z ]+$/)) {
112 # this is a proper-noun (one word, initial capitals)
113 $best = "PN-$best";
114 # print "$best\n";
115 }
116
117 $canon{$s} = $best;
118 }
119 # print "\n";
120 print U "$canon{$s}\n";
121}
122close(U);
Note: See TracBrowser for help on using the repository browser.