source: trunk/gsdl/perllib/Kea-1.1.4/Kea@ 3161

Last change on this file since 3161 was 1995, checked in by jmt14, 23 years ago

* empty log message *

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.2 KB
Line 
1#!/usr/bin/perl -w
2
3# Kea
4# Version 1.1.4
5
6# Kea -- Automatic Keyphrase Extraction
7# Copyright 1998-1999 by Gordon Paynter and Eibe Frank
8# Contact [email protected] or [email protected]
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23
24# Version history
25#
26# 1.0 Witten et al.
27# 1.0.1 Bug: stopword file loaded as model file
28# 1.0.2 Java paths explicit for nikau; JIT compiler
29# 1.0.3 Include tf.idf in output if -t is set
30# 1.0.4 Allow optional keyphrase frequency file
31# 1.0.5 Use $perl_command and $java_command for system-independence
32# 1.0.6 -C argument selects model, stopword file, df file
33# 1.0.7 Changes to Kea.pl.
34# This is Phillip's version
35# 1.0.8 Accepts .htm as well as .html
36# 1.0.9 Accepts .text as well as .txt
37# 1.1 First Distribution. GPL added. Documentation added.
38# 1.1.1 -E argument sets output extension (default is .kea)
39# 1.1.2 Documented java variables
40# Maximum phrase length can be set at command-line.
41# Note: default=3; NOT the length for the model. Sorry.
42# 1.1.3 Moved Lynx command into separate script that checks
43# for circumstances that are likely to crash it.
44# 1.1.4 Updated documentation and added a few extra files.
45
# Announce the program on STDERR.
print STDERR "\nKea (version 1.1.4): automatic keyphrase extraction\n";

# Every path used below (stemmer sources, models, helper scripts, temporary
# directory) is built from the Greenstone home directory.  Refuse to run
# without it instead of silently producing paths that start at "/perllib".
$gsdlhome = $ENV{'GSDLHOME'};
die "Kea: the GSDLHOME environment variable is not set\n"
    if (!defined($gsdlhome) || $gsdlhome eq "");

# (Re)compile the iterated Lovins stemmer on every run.  The resulting
# "stemmer" binary is not used directly in this script; presumably the helper
# scripts invoked later rely on it -- TODO confirm.
`gcc -o $gsdlhome/perllib/Kea-1.1.4/stemmer $gsdlhome/perllib/Kea-1.1.4/Iterated-Lovins-stemmer/stem-Lovins-iterated.c $gsdlhome/perllib/Kea-1.1.4/Iterated-Lovins-stemmer/stem.c`;

# A failed build used to go unnoticed.  Only warn: a stemmer binary from an
# earlier run may still be present and usable.
if ($? == -1) {
    warn "Kea: warning: could not run gcc to build the stemmer: $!\n";
} elsif ($? != 0) {
    warn "Kea: warning: building the stemmer failed (gcc exit status " . ($? >> 8) . ")\n";
}
51
52
# --- External interpreters -------------------------------------------------

# Helper perl scripts are launched through a shell with this command.
$perl_command = "perl -w";

# Java needs more care: point this at your java home directory.
# (alternative location: "/usr/local/jdk")
$java_home = "/usr/local/share/java";

# CLASSPATH handed to java on the command line so that jaws.jar is found.
# There should be no need to edit this.
$java_classpath = join(":",
                       ".",
                       "$gsdlhome/perllib/Kea-1.1.4/jaws.jar",
                       "$gsdlhome/perllib/Kea-1.1.4",
                       "$java_home/lib/classes.zip");

# Name of the java just-in-time compiler, e.g. "tya".
# Leave it empty if you don't have a JIT compiler.
$java_JIT_compiler = "";

# Additional java arguments (stack and heap sizes here).
# Set this to an empty string if no extra arguments are wanted.
$java_extra_args = "-ss100000000 -oss100000000 -mx200000000";

# Assemble the final java command from the pieces above; the optional parts
# are appended only when they are non-empty.
$java_command = qq{$java_home/bin/java -classpath "$java_classpath"};
if ($java_JIT_compiler) {
    $java_command .= " -Djava.compiler=$java_JIT_compiler";
}
if ($java_extra_args) {
    $java_command .= " $java_extra_args";
}
79
80
# Parse command line options.
#
# Getopt::Std replaces the perl4-era "getopts.pl" library, which is no longer
# shipped with Perl (removed from the core in 5.16), so the old
# require("getopts.pl") makes the script die at startup on current Perls.
# getopts() sets the same $opt_X package variables that the rest of this
# script reads: flags -d and -t, and value options -N -E -C -F -K -L -M -S.
use Getopt::Std;
getopts("dtN:E:C:F:K:L:M:S:");

# What files shall we use?
# After option parsing @ARGV holds only the input documents; without at
# least one there is nothing to do, so print the usage summary and stop.
if (!$ARGV[0]) {
    die "Usage: Kea [options] text-or-html-or-cstr-files
Options:
 -d Debug mode
 -t Ouput TF.IDF
 -N n Output n keyphrases
 -L n Maximum phrases length is n (default = 3)
 -E <suffix> Output extension is <suffix>
 -C <corpus> Use model/df/kf/stopwords based on <corpus>
 -F <document-frequency file>
 -K <keyphrase-frequency file>
 -M <Naive-Bayes model file>
 -S <stopword file>
See README for more detail.
";
}
102
103
# -N  Number of phrases to extract.
# When given it must be a positive integer and is passed on as "-N n";
# otherwise the argument string stays empty and the model decides.
if (!$opt_N) {
    print STDERR "Number of phrases dictated by model (default)\n";
    $number_of_phrases = "";
} elsif (($opt_N =~ /^\d+$/) && ($opt_N > 0)) {
    $number_of_phrases = "-N $opt_N";
    print STDERR "Number of phrases to extract: $opt_N\n";
} else {
    die "Kea cannot understand -N argument (must be a number): $opt_N\n";
}

# -L  Maximum phrase length.
# Zero means "not set on the command line".
$maximum_phrase_length = 0;
if ($opt_L) {
    if (($opt_L =~ /^\d+$/) && ($opt_L > 0)) {
        $maximum_phrase_length = "$opt_L";
        print STDERR "Maximum phrase length: $opt_L\n";
    } else {
        die "Kea cannot understand -L argument (must be a number): $opt_L\n";
    }
}

# -E  Extension of the output files ("kea" unless overridden).
# A leading dot in the supplied suffix is dropped.
$extension = "kea";
if ($opt_E) {
    ($extension = $opt_E) =~ s/^\.//g;
    print STDERR "Using output extension: $extension\n";
}

# -t  Should tf.idf be written to the output? (This option is used by Kniles.)
$output_tfidf = $opt_t ? 1 : 0;
print STDERR "Do print tf.idf\n" if ($opt_t);
139
140
# -C  Corpus file stem.
# The stem is the common path prefix of the default model, stopword,
# document-frequency and keyphrase-frequency files; "aliweb" is the default.
if ($opt_C) {
    print STDERR "Corpus: $opt_C (setting default model/stopwords/df)\n";
    $default_stem = "$gsdlhome/perllib/Kea-1.1.4/$opt_C";
} else {
    $default_stem = "$gsdlhome/perllib/Kea-1.1.4/aliweb";
}

# Defaults derived from the stem.  The keyphrase-frequency file is optional
# and only picked up when it exists.
$model_file             = $default_stem . ".model";
$stopword_file          = $default_stem . ".stopwords";
$frequency_file         = $default_stem . ".df";
$keyword_frequency_file = (-e "$default_stem.kf") ? "$default_stem.kf" : "";


# -F  Document frequency file (name is taken relative to the Kea directory)
if ($opt_F) {
    $frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_F";
}
print STDERR "Document frequency file: $frequency_file\n";
unless (-e $frequency_file) {
    die "Document frequency file does not exist!\n";
}

# -M  Model file (name is taken relative to the Kea directory)
if ($opt_M) {
    $model_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_M";
}
print STDERR "Model file: $model_file\n";
unless (-e $model_file) {
    die "Model file does not exist!\n";
}

# -S  Stopword file (name is taken relative to the Kea directory)
if ($opt_S) {
    $stopword_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_S";
}
print STDERR "Stopword file: $stopword_file\n";
unless (-e $stopword_file) {
    die "Stopword file does not exist!\n";
}

# -K  Keyword frequency file; unlike the others it may be absent altogether
if ($opt_K) {
    $keyword_frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_K";
}
if (!$keyword_frequency_file) {
    print STDERR "No keyword frequency file (default)\n";
} else {
    print STDERR "Keyword frequency file: $keyword_frequency_file\n";
    unless (-e $keyword_frequency_file) {
        die "Keyword frequency file does not exist!\n";
    }
}

# Count the input documents left on the command line
$number_of_files = scalar(@ARGV);
print STDERR "Number of files: $number_of_files\n\n";
183
184
185
# Set up working files.
# All intermediate files live under $gsdlhome/tmp and carry the process id
# ($$) in their name so that concurrent runs do not collide.

$stem = "$gsdlhome/tmp/kea.$$";

$data = "$stem.data";   # directory holding the per-document working files
$arff = "$stem.arff";   # file written by k4.pl and read by KEP
$out = "$stem.out";     # standard output of the KEP java run
$err = "$stem.err";     # standard error of the KEP java run

# Create the data directory with Perl's own mkdir so that a failure (for
# example a missing or read-only $gsdlhome/tmp) is reported immediately
# instead of surfacing later as missing files.  An already existing
# directory is reused.  The explicit chmod gives mode 777 regardless of the
# umask, matching "mkdir -m 777".
unless (-d $data) {
    mkdir($data, 0777)
        or die "Kea: cannot create working directory $data: $!\n";
}
chmod(0777, $data)
    or warn "Kea: warning: cannot set permissions on $data: $!\n";
196
197
# Process each input file into a working file.
#
# Every document named on the command line is copied into the data directory
# as "<data>/<index>.<type>" and turned into "<data>/<index>.clauses".
# %original_filename maps the working-file stem back to the name the user
# supplied, so the results can be written next to the original later on.
#
# Files are copied with File::Copy rather than a shell "cp", so names with
# spaces or shell metacharacters are handled correctly and a failed copy is
# reported.
use File::Copy;

print STDERR "Preparing input files in: $data\n";

# Report a helper command that exited with a non-zero status.  This is only
# a warning: the exit-code conventions of the helper scripts are not visible
# from here.
my $warn_if_failed = sub {
    my ($what, $input) = @_;
    warn "\nKea: warning: $what failed for $input (status $?)\n" if ($? != 0);
};

for ($f = 0; $f <= $#ARGV; $f++) {
    print STDERR " document ", ($f+1), ": $ARGV[$f]\r";

    $file = $ARGV[$f];
    $temp = "$data/$f";
    $original_filename{$temp} = $file;

    # The file type is decided by the extension at the END of the name.  The
    # patterns are anchored so that "notes.txt.html" is treated as HTML and
    # a directory such as "my.text/" cannot influence the decision.

    # clause files are used as they are
    if ($file =~ /\.clauses$/) {
        copy($file, "$temp.clauses")
            or die "Kea: cannot copy $file to $temp.clauses: $!\n";
        next;
    }

    # everything else is first coerced into plain text ...
    if ($file =~ /\.te?xt$/) {
        copy($file, "$temp.txt")
            or die "Kea: cannot copy $file to $temp.txt: $!\n";
    } elsif ($file =~ /\.html?$/i) {
        copy($file, "$temp.html")
            or die "Kea: cannot copy $file to $temp.html: $!\n";
        `$perl_command $gsdlhome/perllib/Kea-1.1.4/convert-html-to-text.pl $temp.html > $temp.txt`;
        $warn_if_failed->("convert-html-to-text.pl", $file);
    } elsif ($file =~ /\.cstr$/) {
        copy($file, "$temp.cstr")
            or die "Kea: cannot copy $file to $temp.cstr: $!\n";
        `$perl_command $gsdlhome/perllib/Kea-1.1.4/cstr-to-text.pl $temp.cstr $temp.txt`;
        $warn_if_failed->("cstr-to-text.pl", $file);
    } else {
        die "Unknown file type: $file\n";
    }

    # ... and then prepared as a clause file
    `$perl_command $gsdlhome/perllib/Kea-1.1.4/prepare-clauses.pl $temp.txt $temp.clauses`;
    $warn_if_failed->("prepare-clauses.pl", $file);
}
#print STDERR "\n\n";
228
# Build the arff file.
# k4.pl is given the stopword and document-frequency files, optionally the
# keyphrase-frequency file and the maximum phrase length, then the data
# directory to read and the arff file to write.
$command = "$perl_command $gsdlhome/perllib/Kea-1.1.4/k4.pl -S $stopword_file -f $frequency_file";
$command .= " -K $keyword_frequency_file" if ($keyword_frequency_file);
$command .= " -L $maximum_phrase_length" if ($maximum_phrase_length);
$command .= " $data $arff";

#print STDERR "** $command **\n";
`$command`;
# A failing k4.pl used to pass unnoticed and left KEP without input.
warn "Kea: warning: k4.pl exited with status $? while building $arff\n"
    if ($? != 0);


# use KEP.java to extract phrases
$command = "$java_command KEP -m $model_file -T $arff";
$command .= " $number_of_phrases" if ($number_of_phrases);
# -R is correctly set by default in the java file when the model is loaded
# $command .= " -R tfidf,first_occurrence,keyword_freq,class" if ($keyword_frequency_file);
# standard output goes to $out (parsed afterwards), standard error to $err
$command .= " > $out 2> $err";

print STDERR "** $command **\n";
`$command`;
# Point the user at the captured error output when java reports a failure.
warn "Kea: warning: KEP exited with status $?; see $err for details\n"
    if ($? != 0);
248
249
250
# Read output file and create .kea files.
#
# The KEP output is scanned line by line:
#   "Current document: <stem>.clauses"  starts a new document; <stem> is
#       looked up in %original_filename and the result file is the original
#       name with its last extension replaced by $extension.
#   "Miss: '<stemmed> (<unstemmed>)',...,<tfidf>,no <evidence>"  is one
#       phrase of the current document.
# Each phrase is written as "unstemmed TAB stemmed TAB evidence", followed
# by the tf.idf value when -t was given.  All other lines are ignored.

open(my $kep_fh, "<", $out)
    or die "Kea: cannot read KEP output file $out: $!\n";

$document = "";
my $doc_fh;

while (defined($line = <$kep_fh>)) {

    chomp($line);

    # new document
    if ($line =~ /^Current document/) {
        # close the old document
        if ($document) {
            close($doc_fh)
                or warn "Kea: warning: error closing $document: $!\n";
        }
        # open start the new document
        ($doc) = $line =~ /Current document: (.*)\.clauses$/;
        # Without a known working file there is no original name to derive
        # the output file from; carrying on would write to a file literally
        # called ".$extension" in the current directory.
        if (!defined($doc) || !defined($original_filename{$doc})) {
            die "Kea: cannot match KEP document to an input file: $line\n";
        }
        $document = $original_filename{$doc};
        $document =~ s/\.[^\.]+$//;
        $document .= ".$extension";
        #print STDERR "Opening DOC: $original_filename{$doc} => $document\n";
        open($doc_fh, ">", $document)
            or die "Kea: cannot write $document: $!\n";

    } elsif ($line =~ /^Miss:/) {
        die "Trying to write with no current document!" if (!$document);
        ($phrase, $tfidf, $evidence) = $line =~ /Miss: ([^,]+).*,([^,]+),no (.+)$/;
        # skip lines that do not have the expected shape instead of writing
        # rows of empty fields
        if (!defined($phrase)) {
            warn "Kea: warning: skipping malformed phrase line: $line\n";
            next;
        }
        ($stemmed, $unstemmed) = $phrase =~ /\'(.+) \((.+)\)\'/;
        if (!defined($stemmed)) {
            warn "Kea: warning: skipping malformed phrase: $phrase\n";
            next;
        }
        if ($output_tfidf) {
            print $doc_fh "$unstemmed\t$stemmed\t$evidence\t$tfidf\n";
        } else {
            print $doc_fh "$unstemmed\t$stemmed\t$evidence\n";
        }
    }
}

# Flush and close the last result file explicitly so that write errors
# (e.g. a full disk) are noticed, then release the KEP output.
if ($document) {
    close($doc_fh)
        or warn "Kea: warning: error closing $document: $!\n";
}
close($kep_fh);
287
288
289#get rid of temporary files
290#if (!$opt_d && !$opt_d) {
291# `rm -r $data $arff $out $err`;
292#}
293
294
295
296
297
298
299
Note: See TracBrowser for help on using the repository browser.