source: trunk/gsdl/perllib/Kea-1.1.4/Kea@ 9156

Last change on this file since 9156 was 5106, checked in by jrm21, 21 years ago

need to chomp output of which java.
$opt_d should be "local", not "my"
changed some backticks to system commands.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.9 KB
Line 
1#!/usr/bin/perl -w
2
3# Kea
4# Version 1.1.4
5
6# Kea -- Automatic Keyphrase Extraction
7# Copyright 1998-1999 by Gordon Paynter and Eibe Frank
8# Contact [email protected] or [email protected]
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23
24# Version history
25#
26# 1.0 Witten et.al.
27# 1.0.1 Bug: stopword file loaded as model file
28# 1.0.2 Java paths explicit for nikau; JIT compiler
29# 1.0.3 Include tf.idf in output if -t is set
30# 1.0.4 Allow optional keyphrase frequency file
31# 1.0.5 Use $perl_command and $java_command for system-independence
32# 1.0.6 -C argument selects model, stopword file, df file
33# 1.0.7 Changes to Kea.pl.
34# This is Phillip's version
35# 1.0.8 Accepts .htm as well as .html
36# 1.0.9 Accepts .text as well as .txt
37# 1.1 First Distribution. GPL added. Documentation added.
38# 1.1.1 -E argument sets output extension (default is .kea)
39# 1.1.2 Documented java variables
40# Maximum phrase length can be set at command-line.
41# Note: default=3; NOT the length for the model. Sorry.
42# 1.1.3 Moved Lynx command into separate script that checks
43# for circumstances that are likely to crash it.
44# 1.1.4 Updated documentation and added a few extra files.
45
# Announce the program on STDERR.
print STDERR "Kea (version 1.1.4): automatic keyphrase extraction\n";

# Every resource path in this script is built from $GSDLHOME, so refuse to
# run without it rather than silently falling back to "/perllib/...".
$gsdlhome = $ENV{'GSDLHOME'};
die "Kea: the GSDLHOME environment variable is not set\n"
    if (!defined($gsdlhome) || $gsdlhome eq "");

# Compile the Lovins stemmer if no executable is present yet.
# List-form system() is used so the paths never pass through a shell.
if (! -x "$gsdlhome/perllib/Kea-1.1.4/stemmer") {
    print STDERR "** need to compile stemmer **\n";
    my $gcc_status = system("gcc",
        "-o", "$gsdlhome/perllib/Kea-1.1.4/stemmer",
        "$gsdlhome/perllib/Kea-1.1.4/Iterated-Lovins-stemmer/stem-Lovins-iterated.c",
        "$gsdlhome/perllib/Kea-1.1.4/Iterated-Lovins-stemmer/stem.c");
    # Report a failed build, but carry on as before: whether the stemmer is
    # strictly required is decided by the helper scripts, not visible here.
    if ($gcc_status != 0) {
        print STDERR "** failed to compile stemmer (gcc status $gcc_status) **\n";
    }
}
57
# Helper perl scripts are launched in shells using this command.
$perl_command = "perl -w";

# Java needs a little more setup.
# Point this at your java home directory.
#$java_home = "/usr/local/jdk";
$java_home = "/usr/local/share/java";

# CLASSPATH handed to java on the command line so that jaws.jar is found;
# it should not normally need changing.
$java_classpath = ".:$gsdlhome/perllib/Kea-1.1.4/jaws.jar:$gsdlhome/perllib/Kea-1.1.4:$java_home/lib/classes.zip";

# Name of the java just-in-time compiler (the original author used TYA).
# Leave it empty if you don't have a JIT compiler.
$java_JIT_compiler = "";
#$java_JIT_compiler = "tya";

# Extra arguments for java (memory limits and so on).
# The second assignment wins; comment it out to pass no extra arguments.
$java_extra_args = "";
$java_extra_args = "-ss100000000 -oss100000000 -mx200000000";

# Prefer a java executable found on the PATH; otherwise fall back to the
# one under $java_home.
my $java_exec = "$java_home/bin/java";
if (system("which java >/dev/null 2>/dev/null") == 0) {
    $java_exec = `which java`;
    chomp $java_exec;    # `which` output ends with a newline
}

# Assemble the full java command from the pieces above.
$java_command = "$java_exec -classpath \"$java_classpath\"";
if ($java_JIT_compiler) {
    $java_command .= " -Djava.compiler=$java_JIT_compiler";
}
if ($java_extra_args) {
    $java_command .= " $java_extra_args";
}
93
94
# Parse command line options.
# getopts() stores each switch in the package variable $opt_<letter> and
# removes the processed switches from @ARGV, leaving only the input files.
use Getopt::Std;
local $opt_d = 0;    # debug option

my $kea_usage = "Usage: Kea [options] text-or-html-or-cstr-files
Options:
  -d          Debug mode
  -t          Output TF.IDF
  -N n        Output n keyphrases
  -L n        Maximum phrases length is n (default = 3)
  -E <suffix> Output extension is <suffix>
  -C <corpus> Use model/df/kf/stopwords based on <corpus>
  -F <document-frequency file>
  -K <keyphrase-frequency file>
  -M <Naive-Bayes model file>
  -S <stopword file>
See README for more detail.
";

# getopts() returns false when it meets an unknown switch; stop with the
# usage text instead of carrying on with a half-understood command line.
getopts("dtN:E:C:F:K:L:M:S:") or die $kea_usage;

# What files shall we use?  Test the argument count rather than the truth
# of $ARGV[0], so that a file literally named "0" is still accepted.
die $kea_usage if (!@ARGV);
117
118
# -N  Number of phrases to extract.
# Left empty when -N is absent, so the model decides.
$number_of_phrases = "";
if (!defined($opt_N)) {
    print STDERR "Number of phrases dictated by model (default)\n" if ($opt_d);
}
elsif (($opt_N =~ /^\d+$/) && ($opt_N > 0)) {
    print STDERR "Number of phrases to extract: $opt_N\n";
    $number_of_phrases = "-N $opt_N";
}
else {
    die "Kea cannot understand -N argument (must be a number): $opt_N\n";
}

# -L  Maximum phrase length; 0 means "not specified".
$maximum_phrase_length = 0;
if (defined($opt_L)) {
    unless (($opt_L =~ /^\d+$/) && ($opt_L > 0)) {
        die "Kea cannot understand -L argument (must be a number): $opt_L\n";
    }
    $maximum_phrase_length = "$opt_L";
    print STDERR "Maximum phrase length: $opt_L\n";
}

# -E  Output file extension (a leading dot is stripped).
$extension = "kea";
if (defined($opt_E)) {
    ($extension = $opt_E) =~ s/^\.//g;
    print STDERR "Using output extension: $extension\n";
}

# -t  Should tf.idf be written to the output? (This option is used by Kniles.)
$output_tfidf = (defined($opt_t) && $opt_t) ? 1 : 0;
print STDERR "Do print tf.idf\n" if ($output_tfidf);
156
157
# -C  Corpus file stem: the model, stopword, df and kf files all default to
# "<stem>.<suffix>" inside the Kea directory.
if (defined($opt_C)) {
    print STDERR "Corpus: $opt_C (setting default model/stopwords/df)\n";
}
$default_stem = defined($opt_C)
    ? "$gsdlhome/perllib/Kea-1.1.4/$opt_C"
    : "$gsdlhome/perllib/Kea-1.1.4/aliweb";

# Defaults derived from the stem; the keyword frequency file is optional and
# only used when it actually exists.
$model_file     = "$default_stem.model";
$stopword_file  = "$default_stem.stopwords";
$frequency_file = "$default_stem.df";
$keyword_frequency_file = (-e "$default_stem.kf") ? "$default_stem.kf" : "";

# -F  Document frequency file (name is taken relative to the Kea directory).
$frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_F" if (defined($opt_F));
print STDERR "Document frequency file: $frequency_file\n" if ($opt_d);
unless (-e $frequency_file) {
    die "Document frequency file does not exist!\n";
}

# -M  Model file (name is taken relative to the Kea directory).
$model_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_M" if (defined($opt_M));
print STDERR "Model file: $model_file\n" if ($opt_d);
unless (-e $model_file) {
    die "Model file does not exist!\n";
}

# -S  Stopword file (name is taken relative to the Kea directory).
$stopword_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_S" if (defined($opt_S));
print STDERR "Stopword file: $stopword_file\n" if ($opt_d);
unless (-e $stopword_file) {
    die "Stopword file does not exist!\n";
}

# -K  Keyword frequency file (name is taken relative to the Kea directory).
$keyword_frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_K" if (defined($opt_K));
if (!$keyword_frequency_file) {
    print STDERR "No keyword frequency file (default)\n" if ($opt_d);
}
else {
    print STDERR "Keyword frequency file: $keyword_frequency_file\n";
    unless (-e $keyword_frequency_file) {
        die "Keyword frequency file does not exist!\n";
    }
}
198
# Count the number of input files left on the command line.
$number_of_files = scalar(@ARGV);
if ($opt_d) {print STDERR "Number of files: $number_of_files\n\n";}


# Set up working files.  All scratch names share the stem
# "$gsdlhome/tmp/kea.<pid>" so that concurrent runs do not collide.
$stem = "$gsdlhome/tmp/kea.$$";

$data = "$stem.data";    # directory holding the per-document working files
$arff = "$stem.arff";    # output of k4.pl, input to KEP
$out = "$stem.out";      # KEP standard output
$err = "$stem.err";      # KEP standard error

# Create the data directory with Perl's own mkdir so the path never passes
# through a shell, and stop at once if it cannot be created: nothing below
# can work without it.
mkdir($data, 0777)
    or die "Kea: cannot create working directory $data: $!\n";
# mkdir's mode is filtered by the umask; chmod restores the exact 777 that
# the previous "mkdir -m 777" produced.
chmod(0777, $data);
215
216
# Process each input file into a working file.
# Input number N is copied to "$data/N.<type>" and, unless it is already a
# clause file, converted to text and then to "$data/N.clauses".
# %original_filename maps the working-file stem back to the input name.
if ($opt_d) {print STDERR "Preparing input files in: $data\n";}

for ($f = 0; $f <= $#ARGV; $f++) {
    print STDERR " document ", ($f+1), ": $ARGV[$f]\n";

    $file = $ARGV[$f];
    $temp = "$data/$f";
    $temp =~ s@//@/@g;
    $original_filename{$temp} = $file;

    # The file type is decided by the extension alone.  The patterns are
    # anchored at the end of the name so that e.g. "notes.txt.html" is
    # treated as HTML and "page.htmlx" is rejected.
    # Copies use list-form system() so that file names containing spaces
    # or shell metacharacters are passed to cp untouched.
    if ($file =~ /\.clauses$/i) {
        system("cp", $file, "$temp.clauses") == 0
            or die "Cannot copy $file to $temp.clauses\n";
    } else {
        if ($file =~ /\.te?xt$/i) {
            system("cp", $file, "$temp.txt") == 0
                or die "Cannot copy $file to $temp.txt\n";
        } elsif ($file =~ /\.html?$/i) {
            system("cp", $file, "$temp.html") == 0
                or die "Cannot copy $file to $temp.html\n";
            `$perl_command $gsdlhome/perllib/Kea-1.1.4/convert-html-to-text.pl $temp.html > $temp.txt`;
        } elsif ($file =~ /\.cstr$/) {
            system("cp", $file, "$temp.cstr") == 0
                or die "Cannot copy $file to $temp.cstr\n";
            # Backticks discard anything the helper prints on stdout.
            `$perl_command $gsdlhome/perllib/Kea-1.1.4/cstr-to-text.pl $temp.cstr $temp.txt`;
        } else {
            die "Unknown file type: $file\n";
        }
        # Turn the plain text into a clause file.
        `$perl_command $gsdlhome/perllib/Kea-1.1.4/prepare-clauses.pl $temp.txt $temp.clauses`;
    }
}
247#print STDERR "\n\n";
248
# Build the arff file: k4.pl reads the working directory $data and writes
# $arff, using the stopword and frequency files chosen above.
$command = "$perl_command $gsdlhome/perllib/Kea-1.1.4/k4.pl -S $stopword_file -f $frequency_file";
$command .= " -K $keyword_frequency_file" if ($keyword_frequency_file);
$command .= " -L $maximum_phrase_length" if ($maximum_phrase_length);
$command .= " $data $arff";

if ($opt_d) {
    print STDERR "** $command **\n";
}
# Report a failing helper rather than ignoring it.  Execution continues as
# before, because the exit conventions of k4.pl are not visible here.
if (system("$command") != 0) {
    print STDERR "Kea: warning: k4.pl returned a non-zero status ($?)\n";
}


# Use KEP.java to extract phrases.  Its stdout goes to $out, which is
# parsed afterwards, and its stderr to $err.
$command = "$java_command KEP -m $model_file -T $arff";
$command .= " $number_of_phrases" if ($number_of_phrases);
# -R is correctly set by default in the java file when the model is loaded
# $command .= " -R tfidf,first_occurrence,keyword_freq,class" if ($keyword_frequency_file);
$command .= " > $out 2> $err";

if ($opt_d) {
    print STDERR "** $command **\n";
}
if (system("$command") != 0) {
    print STDERR "Kea: warning: KEP returned a non-zero status ($?); see $err (kept only with -d)\n";
}
272
273
274
# Read the KEP output file and create the .kea files.
#
# The output is scanned line by line:
#   "Current document: <working file>.clauses" starts a new document; the
#       working file is mapped back to the original input name, whose
#       extension is replaced by ".$extension" to name the output file.
#   "Miss: ..." lines carry one phrase each, written as
#       unstemmed <TAB> stemmed <TAB> evidence [<TAB> tfidf]
# All other lines are ignored.

open(my $kep_fh, "<", $out)
    or die "Kea: cannot read KEP output file $out: $!\n";

my $doc_fh;        # handle of the .kea file currently being written
$document = "";

while (<$kep_fh>) {

    chomp;
    $line = $_;

    # new document
    if ($line =~ /^Current document/) {
        # close the old document
        if ($doc_fh) {
            close($doc_fh)
                or print STDERR "Kea: warning: error closing $document: $!\n";
            undef $doc_fh;
        }
        # start the new document
        ($doc) = $line =~ /Current document: (.*)\.clauses$/;
        die "Kea: cannot parse KEP output line: $line\n" if (!defined($doc));
        $doc =~ s@//@/@g;
        # Without this check an unknown working file would silently produce
        # an output file named just ".$extension".
        die "Kea: KEP reported an unknown document: $doc\n"
            if (!defined($original_filename{$doc}));
        $document = $original_filename{$doc};
        $document =~ s/\.[^\.]+$//;
        $document .= ".$extension";
        open($doc_fh, ">", $document)
            or die "Kea: cannot write $document: $!\n";

    } elsif ($line =~ /^Miss:/) {
        die "Trying to write with no current document!" if (!$document);
        ($phrase, $tfidf, $evidence) = $line =~ /Miss: ([^,]+).*,([^,]+),no (.+)$/;
        ($stemmed, $unstemmed) = (undef, undef);
        if (defined($phrase)) {
            ($stemmed, $unstemmed) = $phrase =~ /\'(.+) \((.+)\)\'/;
        }
        # Skip lines that do not have the expected shape instead of writing
        # a row of empty fields.
        if (!defined($unstemmed)) {
            print STDERR "Kea: warning: skipping unparseable line: $line\n";
            next;
        }
        if ($output_tfidf) {
            print $doc_fh "$unstemmed\t$stemmed\t$evidence\t$tfidf\n";
        } else {
            print $doc_fh "$unstemmed\t$stemmed\t$evidence\n";
        }
    }
}

# Flush and close the last document; buffered write errors surface here.
if ($doc_fh) {
    close($doc_fh)
        or print STDERR "Kea: warning: error closing $document: $!\n";
}
close($kep_fh);
312
313
# Remove the temporary working files; in debug mode (-d) they are kept so
# they can be inspected.
unless ($opt_d) {
    my @scratch_paths = ("$data", "$arff", "$out", "$err");
    system("rm", "-r", @scratch_paths);
}
Note: See TracBrowser for help on using the repository browser.