Context Navigation

source: trunk/gsdl/perllib/Kea-1.1.4/Kea@ 3161

Last change on this file since 3161 was 1995, checked in by jmt14, 23 years ago
* empty log message *
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 9.2 KB

Line
1	#!/usr/bin/perl -w
2
3	# Kea
4	# Version 1.1.4
5
6	# Kea -- Automatic Keyphrase Extraction
7	# Copyright 1998-1999 by Gordon Paynter and Eibe Frank
8	# Contact [email protected] or [email protected]
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23
24	# Version history
25	#
26	# 1.0 Witten et.al.
27	# 1.0.1 Bug: stopword file loaded as model file
28	# 1.0.2 Java paths explicit for nikau; JIT compiler
29	# 1.0.3 Include tf.idf in output if -t is set
30	# 1.0.4 Allow optional keyphrase frequency file
31	# 1.0.5 Use $perl_command and $java_command for system-independence
32	# 1.0.6 -C argument selects model, stopword file, df file
33	# 1.0.7 Changes to Kea.pl.
34	# This is Phillip's version
35	# 1.0.8 Accepts .htm as well as .html
36	# 1.0.9 Accepts .text as well as .txt
37	# 1.1 First Distribution. GPL added. Documentation added.
38	# 1.1.1 -E argument sets output extension (default is .kea)
39	# 1.1.2 Documented java variables
40	# Maximum phrase length can be set at command-line.
41	# Note: default=3; NOT the length for the model. Sorry.
42	# 1.1.3 Moved Lynx command into separate script that checks
43	# for circumstances that are likely to crash it.
44	# 1.1.4 Updated documentation and added a few extra files.
45
# Announce ourselves.  All progress messages in this script go to STDERR.
print STDERR "\nKea (version 1.1.4): automatic keyphrase extraction\n";

# Every path used below is built from GSDLHOME, so refuse to run without it
# instead of silently operating on paths such as "/perllib/Kea-1.1.4/...".
$gsdlhome = $ENV{'GSDLHOME'};
die "Kea: the GSDLHOME environment variable is not set\n"
    if (!defined($gsdlhome) || $gsdlhome eq "");

# Compile the iterated Lovins stemmer.  gcc is invoked in list form so that
# no shell is involved and a GSDLHOME containing spaces still works.  A failed
# build is reported; it is only fatal when no stemmer binary exists at all.
{
    my $kea_home = "$gsdlhome/perllib/Kea-1.1.4";
    my $stemmer  = "$kea_home/stemmer";
    my @sources  = ("$kea_home/Iterated-Lovins-stemmer/stem-Lovins-iterated.c",
                    "$kea_home/Iterated-Lovins-stemmer/stem.c");

    my $status = system("gcc", "-o", $stemmer, @sources);
    if ($status != 0) {
        my $reason = ($status == -1) ? "could not run gcc: $!"
                                     : "gcc exited with status " . ($status >> 8);
        if (-x $stemmer) {
            print STDERR "Kea: warning: could not rebuild $stemmer ($reason); using the existing binary\n";
        } else {
            die "Kea: could not build $stemmer ($reason)\n";
        }
    }
}
51
52
# Helper perl scripts are launched through a shell with this command.
$perl_command = "perl -w";

# Java needs a little more configuration.
# Point $java_home at the root of your java installation.
#$java_home = "/usr/local/jdk";
$java_home = "/usr/local/share/java";

# CLASSPATH handed to java on the command line so that jaws.jar is found;
# this should not normally need editing.
$java_classpath = ".:$gsdlhome/perllib/Kea-1.1.4/jaws.jar:$gsdlhome/perllib/Kea-1.1.4:$java_home/lib/classes.zip";

# Name of your java just-in-time compiler (for example "tya").
# Leave it as an empty string if you don't have one.
#$java_JIT_compiler = "tya";
$java_JIT_compiler = "";

# Additional arguments for java, e.g. to give it lots of memory.
# Use an empty string to pass nothing extra.
#$java_extra_args = "";
$java_extra_args = "-ss100000000 -oss100000000 -mx200000000";

# Assemble the actual java command from the settings above; the optional
# parts are appended only when they are non-empty.
$java_command = "$java_home/bin/java -classpath \"$java_classpath\"";
if ($java_JIT_compiler) {
    $java_command .= " -Djava.compiler=$java_JIT_compiler";
}
if ($java_extra_args) {
    $java_command .= " $java_extra_args";
}
79
80
# Parse command line options.
# The old getopts.pl library is no longer shipped with perl (it was removed
# from the core in 5.16).  Getopt::Std is its replacement: getopts() takes the
# same specification and sets the same $opt_X globals used further down.
use Getopt::Std;

my $usage = "Usage: Kea [options] text-or-html-or-cstr-files
Options:
-d Debug mode
-t Ouput TF.IDF
-N n Output n keyphrases
-L n Maximum phrases length is n (default = 3)
-E <suffix> Output extension is <suffix>
-C <corpus> Use model/df/kf/stopwords based on <corpus>
-F <document-frequency file>
-K <keyphrase-frequency file>
-M <Naive-Bayes model file>
-S <stopword file>
See README for more detail.
";

# getopts() returns false on an unknown switch or a missing option argument;
# show the usage message instead of carrying on with a half-parsed command line.
getopts("dtN:E:C:F:K:L:M:S:") or die $usage;

# What files shall we use?
# Test the argument count rather than the truth of $ARGV[0], so that a file
# literally named "0" is still accepted.
if (!@ARGV) {
    die $usage;
}
102
103
# -N Number of phrases to extract.
# Options are tested with defined() rather than for truth, so that "-N 0" is
# rejected like any other invalid value instead of being mistaken for
# "option not given".
if (!defined($opt_N)) {
    print STDERR "Number of phrases dictated by model (default)\n";
    $number_of_phrases = "";
} elsif (($opt_N =~ /\A\d+\z/) && ($opt_N > 0)) {
    $number_of_phrases = "-N $opt_N";
    print STDERR "Number of phrases to extract: $opt_N\n";
} else {
    die "Kea cannot understand -N argument (must be a number): $opt_N\n";
}

# -L maximum phrase length.
# 0 means "not set": no -L argument is then passed on to k4.pl.
$maximum_phrase_length = 0;
if (defined($opt_L)) {
    if (($opt_L =~ /\A\d+\z/) && ($opt_L > 0)) {
        $maximum_phrase_length = "$opt_L";
        print STDERR "Maximum phrase length: $opt_L\n";
    } else {
        die "Kea cannot understand -L argument (must be a number): $opt_L\n";
    }
}

# -E What extension shall we use?
# A single leading dot is dropped, so "-E .kea" and "-E kea" are equivalent.
$extension = "kea";
if (defined($opt_E) && $opt_E ne "") {
    $extension = $opt_E;
    $extension =~ s/^\.//;
    print STDERR "Using output extension: $extension\n";
}

# -t Should we output tfidf? (This option is used by Kniles.)
if (defined($opt_t) && $opt_t) {
    $output_tfidf = 1;
    print STDERR "Do print tf.idf\n";
} else {
    $output_tfidf = 0;
}
139
140
# -C Corpus file stem: the common path prefix of the default model,
# stopword and frequency files ("aliweb" unless -C names another corpus).
if ($opt_C) {
    print STDERR "Corpus: $opt_C (setting default model/stopwords/df)\n";
    $default_stem = "$gsdlhome/perllib/Kea-1.1.4/$opt_C";
} else {
    $default_stem = "$gsdlhome/perllib/Kea-1.1.4/aliweb";
}

# Defaults derived from the stem.  The keyword frequency file is optional
# and is only used by default when it actually exists.
$model_file = "$default_stem.model";
$stopword_file = "$default_stem.stopwords";
$frequency_file = "$default_stem.df";
if (-e "$default_stem.kf") {
    $keyword_frequency_file = "$default_stem.kf";
} else {
    $keyword_frequency_file = "";
}


# -F Document Frequency file (named relative to the Kea directory)
if ($opt_F) {
    $frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_F";
}
print STDERR "Document frequency file: $frequency_file\n";
unless (-e $frequency_file) {
    die "Document frequency file does not exist!\n";
}

# -M Model file (named relative to the Kea directory)
if ($opt_M) {
    $model_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_M";
}
print STDERR "Model file: $model_file\n";
unless (-e $model_file) {
    die "Model file does not exist!\n";
}

# -S Stopword file (named relative to the Kea directory)
if ($opt_S) {
    $stopword_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_S";
}
print STDERR "Stopword file: $stopword_file\n";
unless (-e $stopword_file) {
    die "Stopword file does not exist!\n";
}

# -K Keyword frequency file (named relative to the Kea directory)
if ($opt_K) {
    $keyword_frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_K";
}
if (!$keyword_frequency_file) {
    print STDERR "No keyword frequency file (default)\n";
} else {
    print STDERR "Keyword frequency file: $keyword_frequency_file\n";
    unless (-e $keyword_frequency_file) {
        die "Keyword frequency file does not exist!\n";
    }
}
179
# Count the number of files
$number_of_files = scalar(@ARGV);
print STDERR "Number of files: $number_of_files\n\n";



# Set up working files.
# Everything lives under $GSDLHOME/tmp and is named after our process id.

$stem = "$gsdlhome/tmp/kea.$$";

$data = "$stem.data";
$arff = "$stem.arff";
$out = "$stem.out";
$err = "$stem.err";

# Create the data directory with perl's own mkdir so that a failure (missing
# tmp directory, no write permission, ...) is reported here rather than
# surfacing later as confusing errors from the helper scripts.  A directory
# that already exists is tolerated.  The explicit chmod reproduces
# "mkdir -m 777", which is not subject to the umask.
if (!(-d $data)) {
    mkdir($data, 0777)
        or die "Kea: cannot create working directory $data: $!\n";
}
chmod(0777, $data)
    or print STDERR "Kea: warning: cannot set permissions on $data: $!\n";
196
197
# Process each input file into a working file
use File::Copy;

# Run a helper command through the shell, discarding its standard output
# (as backticks do).  A non-zero exit status is reported on STDERR.
# Returns 1 on success and 0 on failure.
sub kea_run_helper {
    my ($helper_command) = @_;
    `$helper_command`;
    if ($? != 0) {
        print STDERR "\nKea: warning: command failed (status $?): $helper_command\n";
        return 0;
    }
    return 1;
}

# Copy an input file into the data directory without going through a shell,
# so that spaces or shell metacharacters in a user-supplied file name are
# harmless.  Returns 1 on success and 0 (after a warning) on failure.
sub kea_copy_input {
    my ($from, $to) = @_;
    if (!copy($from, $to)) {
        print STDERR "\nKea: warning: cannot copy $from to $to: $!\n";
        return 0;
    }
    return 1;
}

print STDERR "Preparing input files in: $data\n";

foreach $f (0 .. $#ARGV) {
    print STDERR " document ", ($f+1), ": $ARGV[$f]\r";

    $file = $ARGV[$f];
    $temp = "$data/$f";
    # remembered so the output file can be written next to the original
    $original_filename{$temp} = $file;

    # copy the file to the data directory & coerce into a clause file.
    # The extension patterns are anchored at the end of the name so that
    # only the real extension decides the file type.
    if ($file =~ /\.clauses\z/) {
        # already a clause file: nothing to convert
        kea_copy_input($file, "$temp.clauses");
        next;
    }

    if ($file =~ /\.te?xt\z/) {
        kea_copy_input($file, "$temp.txt") or next;
    } elsif ($file =~ /\.html?\z/i) {
        kea_copy_input($file, "$temp.html") or next;
        kea_run_helper("$perl_command $gsdlhome/perllib/Kea-1.1.4/convert-html-to-text.pl $temp.html > $temp.txt");
    } elsif ($file =~ /\.cstr\z/) {
        kea_copy_input($file, "$temp.cstr") or next;
        kea_run_helper("$perl_command $gsdlhome/perllib/Kea-1.1.4/cstr-to-text.pl $temp.cstr $temp.txt");
    } else {
        die "Unknown file type: $file\n";
    }

    # prepare the file
    kea_run_helper("$perl_command $gsdlhome/perllib/Kea-1.1.4/prepare-clauses.pl $temp.txt $temp.clauses");
}
#print STDERR "\n\n";
228
# Build the arff file
$command = "$perl_command $gsdlhome/perllib/Kea-1.1.4/k4.pl -S $stopword_file -f $frequency_file";
$command .= " -K $keyword_frequency_file" if ($keyword_frequency_file);
$command .= " -L $maximum_phrase_length" if ($maximum_phrase_length);
$command .= " $data $arff";

#print STDERR " $command \n";
`$command`;
# Report a failing k4.pl run instead of silently carrying on.
if ($? != 0) {
    print STDERR "Kea: warning: k4.pl failed (status $?): $command\n";
}


# use KEP.java to extract phrases
$command = "$java_command KEP -m $model_file -T $arff";
$command .= " $number_of_phrases" if ($number_of_phrases);
# -R is correctly set by default in the java file when the model is loaded
# $command .= " -R tfidf,first_occurrence,keyword_freq,class" if ($keyword_frequency_file);
$command .= " > $out 2> $err";

print STDERR " $command \n";
`$command`;
# java's standard error was redirected to $err, so point the user at it
# when the run fails.
if ($? != 0) {
    print STDERR "Kea: warning: java KEP failed (status $?); see $err\n";
}
248
249
250
# Read output file and create .kea files.
#
# The KEP output is scanned line by line.  A "Current document" line starts
# a new output file, named after the original input file with its extension
# replaced by $extension.  Each following "Miss:" line contributes one
# tab-separated phrase record to that file.

open(my $kep_fh, "<", $out)
    or die "Kea: cannot read KEP output file $out: $!\n";

$document = "";
my $doc_fh;

while (defined($line = <$kep_fh>)) {

    chomp($line);

    # new document
    if ($line =~ /^Current document/) {
        # close the old document
        if ($doc_fh) {
            close($doc_fh)
                or print STDERR "Kea: warning: error closing $document: $!\n";
            undef $doc_fh;
        }
        # start the new document: map the working file named by KEP back
        # to the input file it was created from
        ($doc) = $line =~ /Current document: (.*)\.clauses$/;
        die "Kea: cannot identify the document in KEP output line: $line\n"
            if (!defined($doc) || !defined($original_filename{$doc}));
        $document = $original_filename{$doc};
        $document =~ s/\.[^\.]+$//;
        $document .= ".$extension";
        #print STDERR "Opening DOC: $original_filename{$doc} => $document\n";
        open($doc_fh, ">", $document)
            or die "Kea: cannot write $document: $!\n";

    } elsif ($line =~ /^Miss:/) {
        die "Trying to write with no current document!" if (!$document);
        ($phrase, $tfidf, $evidence) = $line =~ /Miss: ([^,]+).*,([^,]+),no (.+)$/;
        ($stemmed, $unstemmed) = $phrase =~ /\'(.+) \((.+)\)\'/
            if (defined($phrase));
        # skip lines that do not have the expected shape rather than
        # writing a record made of undefined fields
        if (!defined($phrase) || !defined($stemmed)) {
            print STDERR "Kea: warning: skipping unparseable KEP output line: $line\n";
            next;
        }
        if ($output_tfidf) {
            print $doc_fh "$unstemmed\t$stemmed\t$evidence\t$tfidf\n";
        } else {
            print $doc_fh "$unstemmed\t$stemmed\t$evidence\n";
        }
    }
}

# Close the last output file explicitly: buffered write errors only show up
# at close time.
if ($doc_fh) {
    close($doc_fh)
        or print STDERR "Kea: warning: error closing $document: $!\n";
}
close($kep_fh);
287
288
289	#get rid of temporary files
290	#if (!$opt_d && !$opt_d) {
291	# `rm -r $data $arff $out $err`;
292	#}
293
294
295
296
297
298
299

Note: See TracBrowser for help on using the repository browser.

Download in other formats: