Context Navigation

source: trunk/gsdl/perllib/Kea-1.1.4/Kea@ 9156

Last change on this file since 9156 was 5106, checked in by jrm21, 21 years ago
need to chomp output of `which java`. $opt_d should be "local", not "my" changed some backticks to system commands.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 9.9 KB

Line
1	#!/usr/bin/perl -w
2
3	# Kea
4	# Version 1.1.4
5
6	# Kea -- Automatic Keyphrase Extraction
7	# Copyright 1998-1999 by Gordon Paynter and Eibe Frank
8	# Contact [email protected] or [email protected]
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23
24	# Version history
25	#
26	# 1.0 Witten et al.
27	# 1.0.1 Bug: stopword file loaded as model file
28	# 1.0.2 Java paths explicit for nikau; JIT compiler
29	# 1.0.3 Include tf.idf in output if -t is set
30	# 1.0.4 Allow optional keyphrase frequency file
31	# 1.0.5 Use $perl_command and $java_command for system-independence
32	# 1.0.6 -C argument selects model, stopword file, df file
33	# 1.0.7 Changes to Kea.pl.
34	# This is Phillip's version
35	# 1.0.8 Accepts .htm as well as .html
36	# 1.0.9 Accepts .text as well as .txt
37	# 1.1 First Distribution. GPL added. Documentation added.
38	# 1.1.1 -E argument sets output extension (default is .kea)
39	# 1.1.2 Documented java variables
40	# Maximum phrase length can be set at command-line.
41	# Note: default=3; NOT the length for the model. Sorry.
42	# 1.1.3 Moved Lynx command into separate script that checks
43	# for circumstances that are likely to crash it.
44	# 1.1.4 Updated documentation and added a few extra files.
45
46	print STDERR "Kea (version 1.1.4): automatic keyphrase extraction\n";
47	# Every path used below is built from GSDLHOME, so refuse to run without it.
48	$gsdlhome = $ENV{'GSDLHOME'} or die "Kea: the GSDLHOME environment variable is not set\n";
49	# Compile the stemmer if no executable is present; report a failed build.
50	if (! -x "$gsdlhome/perllib/Kea-1.1.4/stemmer") {
51	print STDERR " need to compile stemmer \n";
52	system("gcc", ("-o", "$gsdlhome/perllib/Kea-1.1.4/stemmer",
53	"$gsdlhome/perllib/Kea-1.1.4/Iterated-Lovins-stemmer/stem-Lovins-iterated.c",
54	"$gsdlhome/perllib/Kea-1.1.4/Iterated-Lovins-stemmer/stem.c"
55	)) == 0 or warn "Kea: warning: could not compile the stemmer (gcc failed)\n";
56	}
57
58	# Kea runs other perl scripts in shells; this is the command prefix used for them.
59	$perl_command = "perl -w";
60
61	# Java is a bit more difficult.
62	# Set this variable to your java home directory.
63	#$java_home = "/usr/local/jdk";
64	$java_home = "/usr/local/share/java";
65
66	# This variable will hold the CLASSPATH for java which we set at
67	# the command line to incorporate jaws.jar; you shouldn't need to change it.
68	$java_classpath = ".:$gsdlhome/perllib/Kea-1.1.4/jaws.jar:$gsdlhome/perllib/Kea-1.1.4:$java_home/lib/classes.zip";
69
70	# The name of your java just-in-time compiler. I use TYA.
71	# An empty string means you don't have a JIT compiler.
72	$java_JIT_compiler = "";
73	#$java_JIT_compiler = "tya";
74
75	# If you want to give java lots of memory or use other arguments,
76	# use this variable, otherwise make it an empty string.
77	$java_extra_args = "";  # overridden by the next line; comment that line out to pass no extra arguments
78	$java_extra_args = "-ss100000000 -oss100000000 -mx200000000";
79
80	# See if a java executable is on the PATH; otherwise fall back to $java_home/bin/java.
81	my $java_exec="";
82	if (system("which java >/dev/null 2>/dev/null")==0) {
83	$java_exec=`which java`;
84	chomp $java_exec;  # remove the trailing newline from the `which` output
85	} else {
86	$java_exec="$java_home/bin/java";
87	}
88
89	# The actual java command is based on these other variables:
90	$java_command = "$java_exec -classpath \"$java_classpath\"";
91	$java_command .= " -Djava.compiler=$java_JIT_compiler" if ($java_JIT_compiler);
92	$java_command .= " $java_extra_args" if ($java_extra_args);
93
94
95	# Parse command line options (single-letter flags are stored in $opt_<letter>)
96	use Getopt::Std;
97	local $opt_d=0; # debug option
98	getopts("dtN:E:C:F:K:L:M:S:");
99
100	# What files shall we use? Test the argument count, not the truth of the first argument.
101	if (!@ARGV) {
102	die "Usage: Kea [options] text-or-html-or-cstr-files
103	Options:
104	-d Debug mode
105	-t Output TF.IDF
106	-N n Output n keyphrases
107	-L n Maximum phrases length is n (default = 3)
108	-E <suffix> Output extension is <suffix>
109	-C <corpus> Use model/df/kf/stopwords based on <corpus>
110	-F <document-frequency file>
111	-K <keyphrase-frequency file>
112	-M <Naive-Bayes model file>
113	-S <stopword file>
114	See README for more detail.
115	";
116	}
117
118
119	# -N Number of phrases to extract: must be a positive integer; passed on to KEP as "-N n"
120	if (defined($opt_N) && ($opt_N =~ /^\d+$/) && ($opt_N > 0)) {
121	$number_of_phrases = "-N $opt_N";
122	print STDERR "Number of phrases to extract: $opt_N\n";
123	} elsif (defined($opt_N)) {
124	die "Kea cannot understand -N argument (must be a number): $opt_N\n";
125	} else {
126	if ($opt_d) {
127	print STDERR "Number of phrases dictated by model (default)\n";
128	}
129	$number_of_phrases = "";  # empty: no -N flag is added to the KEP command
130	}
131
132	# -L Maximum phrase length: must be a positive integer; 0 means no -L flag is given to k4.pl
133	$maximum_phrase_length = 0;
134	if (defined($opt_L) && ($opt_L =~ /^\d+$/) && ($opt_L > 0)) {
135	$maximum_phrase_length = "$opt_L";
136	print STDERR "Maximum phrase length: $opt_L\n";
137	} elsif (defined($opt_L)) {
138	die "Kea cannot understand -L argument (must be a number): $opt_L\n";
139	}
140
141	# -E What extension shall we use for the output files? (default is "kea")
142	$extension = "kea";
143	if (defined($opt_E)) {
144	$extension = $opt_E;
145	$extension =~ s/^\.//g;  # drop a leading dot; one is added when the output name is built
146	print STDERR "Using output extension: $extension\n";
147	}
148
149	# -t Should we output tfidf? (This option is used by Kniles.)
150	if (defined($opt_t) && $opt_t) {
151	$output_tfidf = 1;
152	print STDERR "Do print tf.idf\n";
153	} else {
154	$output_tfidf = 0;
155	}
156
157
158	# -C Corpus file stem: base name, inside the Kea-1.1.4 directory, of the default model/stopwords/df/kf files
159	$default_stem = "$gsdlhome/perllib/Kea-1.1.4/aliweb";
160	if (defined($opt_C)) {
161	print STDERR "Corpus: $opt_C (setting default model/stopwords/df)\n";
162	$default_stem = "$gsdlhome/perllib/Kea-1.1.4/$opt_C";
163	}
164
165	# Set the default model etc.; each can be overridden by its own option below.
166	$model_file = "$default_stem.model";
167	$stopword_file = "$default_stem.stopwords";
168	$frequency_file = "$default_stem.df";
169	$keyword_frequency_file = "";
170	$keyword_frequency_file = "$default_stem.kf" if (-e "$default_stem.kf");  # the .kf file is optional
171
172
173	# -F Document Frequency file (the name given is looked up inside the Kea-1.1.4 directory)
174	if (defined($opt_F)) {$frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_F";}
175	if ($opt_d) {print STDERR "Document frequency file: $frequency_file\n";}
176	die "Document frequency file does not exist!\n" if (!(-e $frequency_file));
177
178	# -M Model file (the name given is looked up inside the Kea-1.1.4 directory)
179	if (defined($opt_M)) {$model_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_M";}
180	if ($opt_d) {print STDERR "Model file: $model_file\n";}
181	die "Model file does not exist!\n" if (!(-e $model_file));
182
183	# -S Stopword file (the name given is looked up inside the Kea-1.1.4 directory)
184	if (defined($opt_S)) {$stopword_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_S";}
185	if ($opt_d) {print STDERR "Stopword file: $stopword_file\n";}
186	die "Stopword file does not exist!\n" if (!(-e $stopword_file));
187
188	# -K Keyword frequency file (optional; if one is selected it must exist)
189	if (defined($opt_K)) {
190	$keyword_frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_K";
191	}
192	if ($keyword_frequency_file) {
193	print STDERR "Keyword frequency file: $keyword_frequency_file\n";
194	die "Keyword frequency file does not exist!\n" if (! -e $keyword_frequency_file);
195	} else {
196	if ($opt_d) {print STDERR "No keyword frequency file (default)\n";}
197	}
198
199	# Count the number of files (only reported in debug mode)
200	$number_of_files = $#ARGV + 1;
201	if ($opt_d) {print STDERR "Number of files: $number_of_files\n\n";}
202
203
204
205	# Set up working files; the process id ($$) keeps concurrent runs apart
206
207	$stem = "$gsdlhome/tmp/kea.$$";
208
209	$data = "$stem.data";
210	$arff = "$stem.arff";
211	$out = "$stem.out";
212	$err = "$stem.err";
213	# Create the data directory without a shell (mode 777 as before) and stop if that fails.
214	mkdir($data) && chmod(0777, $data) or die "Kea: cannot create working directory $data: $!\n";
215
216
217	# Process each input file into a working file named <data dir>/<index>.<type>
218	if ($opt_d) {print STDERR "Preparing input files in: $data\n";}
219
220	for ($f = 0; $f <= $#ARGV; $f++) {
221	print STDERR " document ", ($f+1), ": $ARGV[$f]\n";
222
223	$file = $ARGV[$f];
224	$temp = "$data/$f";
225	$temp =~ s@//@/@g;
226	$original_filename{$temp} = $file;  # lets the output stage map a working file back to its source
227
228	# copy the file to the data directory & coerce into a clause file
229	if ($file =~ /\.clauses$/i) {  # patterns are anchored so only the final extension decides the type
230	system("cp","$file","$temp.clauses");
231	} else {
232	if ($file =~ /\.te?xt$/i) {
233	system("cp","$file","$temp.txt");
234	} elsif ($file =~ /\.html?$/i) {
235	system("cp", "$file", "$temp.html");
236	`$perl_command $gsdlhome/perllib/Kea-1.1.4/convert-html-to-text.pl $temp.html > $temp.txt`;
237	} elsif ($file =~ /\.cstr$/) {
238	system("cp", "$file", "$temp.cstr");  # list form, like the other copies: no shell parsing of $file
239	`$perl_command $gsdlhome/perllib/Kea-1.1.4/cstr-to-text.pl $temp.cstr $temp.txt`;
240	} else {
241	die "Unknown file type: $file\n";
242	}
243	# prepare the file
244	`$perl_command $gsdlhome/perllib/Kea-1.1.4/prepare-clauses.pl $temp.txt $temp.clauses`;
245	}
246	}
247	#print STDERR "\n\n";
248
249	# Build the arff file: k4.pl is given the data directory and the arff path as its last two arguments
250	$command = "$perl_command $gsdlhome/perllib/Kea-1.1.4/k4.pl -S $stopword_file -f $frequency_file";
251	$command .= " -K $keyword_frequency_file" if ($keyword_frequency_file);
252	$command .= " -L $maximum_phrase_length" if ($maximum_phrase_length);
253	$command .= " $data $arff";
254
255	if ($opt_d) {
256	print STDERR " $command \n";
257	}
258	system ("$command");  # the exit status is not checked
259
260
261	# Use KEP.java to extract phrases from the arff file
262	$command = "$java_command KEP -m $model_file -T $arff";
263	$command .= " $number_of_phrases" if ($number_of_phrases);
264	# -R is correctly set by default in the java file when the model is loaded
265	# $command .= " -R tfidf,first_occurrence,keyword_freq,class" if ($keyword_frequency_file);
266	$command .= " > $out 2> $err";  # stdout goes to $out (parsed below), stderr to $err
267
268	if ($opt_d) {
269	print STDERR " $command \n";
270	}
271	system ("$command");  # the exit status is not checked
272
273
274
275	# Read the KEP output file and create one .kea (or -E extension) file per input document
276
277	open(KEP, "<", $out) or die "Kea: cannot read KEP output file $out: $!\n";
278
279	$document = "";
280
281	while (<KEP>) {
282
283	chomp;
284	$line = $_;
285
286	# new document
287	if ($line =~ "^Current document") {
288	# close the old document
289	if ($document) {
290	close(DOC);
291	}
292	# start the new document: map the working file back to the original input name
293	($doc) = $line =~ /Current document: (.*)\.clauses$/;
294	$doc =~ s@//@/@g;
295	$document = $original_filename{$doc};
296	$document =~ s/\.[^\.]+$//;
297	$document .= ".$extension";
298	#print STDERR "Opening DOC: $original_filename{$doc} => $document\n";
299	open(DOC, ">", $document) or die "Kea: cannot write $document: $!\n";
300
301	} elsif ($line =~ "^Miss:") {
302	die "Trying to write with no current document!" if (!$document);
303	($phrase, $tfidf, $evidence) = $line =~ /Miss: ([^,]+).*,([^,]+),no (.+)$/;
304	($stemmed, $unstemmed) = $phrase =~ /\'(.+) \((.+)\)\'/;
305	if ($output_tfidf) {
306	print DOC "$unstemmed\t$stemmed\t$evidence\t$tfidf\n";
307	} else {
308	print DOC "$unstemmed\t$stemmed\t$evidence\n";
309	}
310	}
311	}
312	close(DOC) if ($document);  # flush the last output file
313	close(KEP);
314	# Remove the working directory and scratch files, unless debugging (-d) asked to keep them.
315	unless ($opt_d) {
316	system("rm", "-r", $data, $arff, $out, $err);
317	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: