1 | #!/usr/bin/perl -w
|
---|
2 |
|
---|
3 | # Kea
|
---|
4 | # Version 1.1.4
|
---|
5 |
|
---|
6 | # Kea -- Automatic Keyphrase Extraction
|
---|
7 | # Copyright 1998-1999 by Gordon Paynter and Eibe Frank
|
---|
8 | # Contact [email protected] or [email protected]
|
---|
9 | #
|
---|
10 | # This program is free software; you can redistribute it and/or modify
|
---|
11 | # it under the terms of the GNU General Public License as published by
|
---|
12 | # the Free Software Foundation; either version 2 of the License, or
|
---|
13 | # (at your option) any later version.
|
---|
14 | #
|
---|
15 | # This program is distributed in the hope that it will be useful,
|
---|
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
18 | # GNU General Public License for more details.
|
---|
19 | #
|
---|
20 | # You should have received a copy of the GNU General Public License
|
---|
21 | # along with this program; if not, write to the Free Software
|
---|
22 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
23 |
|
---|
24 | # Version history
|
---|
25 | #
|
---|
26 | # 1.0 Witten et al.
|
---|
27 | # 1.0.1 Bug: stopword file loaded as model file
|
---|
28 | # 1.0.2 Java paths explicit for nikau; JIT compiler
|
---|
29 | # 1.0.3 Include tf.idf in output if -t is set
|
---|
30 | # 1.0.4 Allow optional keyphrase frequency file
|
---|
31 | # 1.0.5 Use $perl_command and $java_command for system-independence
|
---|
32 | # 1.0.6 -C argument selects model, stopword file, df file
|
---|
33 | # 1.0.7 Changes to Kea.pl.
|
---|
34 | # This is Phillip's version
|
---|
35 | # 1.0.8 Accepts .htm as well as .html
|
---|
36 | # 1.0.9 Accepts .text as well as .txt
|
---|
37 | # 1.1 First Distribution. GPL added. Documentation added.
|
---|
38 | # 1.1.1 -E argument sets output extension (default is .kea)
|
---|
39 | # 1.1.2 Documented java variables
|
---|
40 | # Maximum phrase length can be set at command-line.
|
---|
41 | # Note: default=3; NOT the length for the model. Sorry.
|
---|
42 | # 1.1.3 Moved Lynx command into separate script that checks
|
---|
43 | # for circumstances that are likely to crash it.
|
---|
44 | # 1.1.4 Updated documentation and added a few extra files.
|
---|
45 |
|
---|
# Announce the program on STDERR.
print STDERR "Kea (version 1.1.4): automatic keyphrase extraction\n";

# Every helper script, model file and working file used below is located
# relative to the directory named by the GSDLHOME environment variable.
# Refuse to continue without it instead of silently building paths such
# as "/perllib/Kea-1.1.4/...".
$gsdlhome = $ENV{'GSDLHOME'};
if (!defined($gsdlhome) || $gsdlhome eq "") {
    die "Kea: the GSDLHOME environment variable is not set\n";
}

# The stemmer is built from its C sources the first time Kea runs, i.e.
# whenever no executable file called "stemmer" is present.
if (! -x "$gsdlhome/perllib/Kea-1.1.4/stemmer") {
    print STDERR "** need to compile stemmer **\n";

    # List form of system(): the arguments are handed to gcc directly,
    # without passing through a shell.
    my $gcc_status = system("gcc",
        "-o", "$gsdlhome/perllib/Kea-1.1.4/stemmer",
        "$gsdlhome/perllib/Kea-1.1.4/Iterated-Lovins-stemmer/stem-Lovins-iterated.c",
        "$gsdlhome/perllib/Kea-1.1.4/Iterated-Lovins-stemmer/stem.c");

    # Report a failed build, but carry on as the original script did.
    if ($gcc_status != 0) {
        print STDERR "Kea: warning: could not compile stemmer (gcc status $gcc_status)\n";
    }
}
57 |
|
---|
# Kea runs other perl scripts in shells; this is the command prefix used
# to start them.
$perl_command = "perl -w";

# Java is a bit more difficult.
# Set this variable to your java home directory.
#$java_home = "/usr/local/jdk";
$java_home = "/usr/local/share/java";

# CLASSPATH for java, passed on the command line so that it includes
# jaws.jar; you shouldn't need to change it.
$java_classpath = ".:$gsdlhome/perllib/Kea-1.1.4/jaws.jar:$gsdlhome/perllib/Kea-1.1.4:$java_home/lib/classes.zip";

# The name of your java just-in-time compiler. I use TYA.
# An empty string means you don't have a JIT compiler.
$java_JIT_compiler = "";
#$java_JIT_compiler = "tya";

# If you want to give java lots of memory or use other arguments,
# use this variable, otherwise make it an empty string.
#$java_extra_args = "";
$java_extra_args = "-ss100000000 -oss100000000 -mx200000000";

# Find the java executable.  "which" is asked once: its exit status says
# whether java is on the PATH and its output is the location.  If the
# probe fails, or succeeds without printing anything, fall back to the
# executable under $java_home.
my $java_exec = `which java 2>/dev/null`;
my $which_status = $?;
$java_exec = "" if (!defined($java_exec));
chomp $java_exec;
if ($which_status != 0 || $java_exec eq "") {
    $java_exec = "$java_home/bin/java";
}

# The actual java command is based on these other variables:
$java_command = "$java_exec -classpath \"$java_classpath\"";
$java_command .= " -Djava.compiler=$java_JIT_compiler" if ($java_JIT_compiler);
$java_command .= " $java_extra_args" if ($java_extra_args);
93 |
|
---|
94 |
|
---|
# Read the command line switches.  Getopt::Std stores each recognised
# switch in the matching $opt_X package variable; a letter followed by
# ':' in the specification takes an argument.
use Getopt::Std;
local $opt_d = 0;    # debug switch, off unless -d is given
getopts("dtN:E:C:F:K:L:M:S:");

# Whatever is left in @ARGV is the list of documents to process.  Without
# a first document there is nothing to do, so show the usage text.
unless ($ARGV[0]) {
    die <<'END_OF_USAGE';
Usage: Kea [options] text-or-html-or-cstr-files
Options:
-d Debug mode
-t Ouput TF.IDF
-N n Output n keyphrases
-L n Maximum phrases length is n (default = 3)
-E <suffix> Output extension is <suffix>
-C <corpus> Use model/df/kf/stopwords based on <corpus>
-F <document-frequency file>
-K <keyphrase-frequency file>
-M <Naive-Bayes model file>
-S <stopword file>
See README for more detail.
END_OF_USAGE
}
117 |
|
---|
118 |
|
---|
# -N  Number of phrases to extract.  When given it must be a positive
#     whole number and is turned into a "-N n" argument for the
#     extractor; otherwise the argument string stays empty.
$number_of_phrases = "";
if (defined($opt_N)) {
    unless (($opt_N =~ /^\d+$/) && ($opt_N > 0)) {
        die "Kea cannot understand -N argument (must be a number): $opt_N\n";
    }
    $number_of_phrases = "-N $opt_N";
    print STDERR "Number of phrases to extract: $opt_N\n";
} elsif ($opt_d) {
    print STDERR "Number of phrases dictated by model (default)\n";
}

# -L  Maximum phrase length.  Zero means "not set on the command line".
$maximum_phrase_length = 0;
if (defined($opt_L)) {
    unless (($opt_L =~ /^\d+$/) && ($opt_L > 0)) {
        die "Kea cannot understand -L argument (must be a number): $opt_L\n";
    }
    $maximum_phrase_length = "$opt_L";
    print STDERR "Maximum phrase length: $opt_L\n";
}

# -E  Extension of the output files; a leading dot in the argument is
#     dropped because one is added when the output name is built.
$extension = defined($opt_E) ? $opt_E : "kea";
if (defined($opt_E)) {
    $extension =~ s/^\.//g;
    print STDERR "Using output extension: $extension\n";
}

# -t  Should we output tfidf? (This option is used by Kniles.)
$output_tfidf = (defined($opt_t) && $opt_t) ? 1 : 0;
print STDERR "Do print tf.idf\n" if ($output_tfidf);
156 |
|
---|
157 |
|
---|
# -C  Corpus name.  The model, stopword, document-frequency and (optional)
#     keyword-frequency files all default to files sharing this stem
#     inside the Kea directory; "aliweb" is used when -C is absent.
print STDERR "Corpus: $opt_C (setting default model/stopwords/df)\n" if (defined($opt_C));
$default_stem = "$gsdlhome/perllib/Kea-1.1.4/" . (defined($opt_C) ? $opt_C : "aliweb");

# Defaults derived from the stem.  The keyword frequency file is optional
# and only picked up when it exists.
$model_file = $default_stem . ".model";
$stopword_file = $default_stem . ".stopwords";
$frequency_file = $default_stem . ".df";
$keyword_frequency_file = (-e "$default_stem.kf") ? "$default_stem.kf" : "";

# -F  Document frequency file (named relative to the Kea directory).
$frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_F" if (defined($opt_F));
print STDERR "Document frequency file: $frequency_file\n" if ($opt_d);
unless (-e $frequency_file) {
    die "Document frequency file does not exist!\n";
}

# -M  Model file (named relative to the Kea directory).
$model_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_M" if (defined($opt_M));
print STDERR "Model file: $model_file\n" if ($opt_d);
unless (-e $model_file) {
    die "Model file does not exist!\n";
}

# -S  Stopword file (named relative to the Kea directory).
$stopword_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_S" if (defined($opt_S));
print STDERR "Stopword file: $stopword_file\n" if ($opt_d);
unless (-e $stopword_file) {
    die "Stopword file does not exist!\n";
}

# -K  Keyword frequency file.  Unlike the others this one may be absent
#     altogether; when one is in use its name is always reported.
$keyword_frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_K" if (defined($opt_K));
if ($keyword_frequency_file) {
    print STDERR "Keyword frequency file: $keyword_frequency_file\n";
    unless (-e $keyword_frequency_file) {
        die "Keyword frequency file does not exist!\n";
    }
} elsif ($opt_d) {
    print STDERR "No keyword frequency file (default)\n";
}
198 |
|
---|
# Count the number of files
$number_of_files = scalar(@ARGV);
if ($opt_d) {print STDERR "Number of files: $number_of_files\n\n";}

# Set up working files.  Everything for this run lives under a name that
# includes the process id ($$).
$stem = "$gsdlhome/tmp/kea.$$";

$data = "$stem.data";    # directory holding one working copy per document
$arff = "$stem.arff";    # file produced by k4.pl and read by KEP
$out = "$stem.out";      # standard output of KEP
$err = "$stem.err";      # standard error of KEP

# Create the working directory with perl's own mkdir rather than through
# a shell, and stop if that fails: nothing below can work without it.
# The requested mode is 0777, but (unlike "mkdir -m 777") the process
# umask is applied to it.
mkdir($data, 0777)
    or die "Kea: cannot create working directory $data: $!\n";
215 |
|
---|
216 |
|
---|
# Process each input file into a working file.
#
# Document number $f is copied to "$data/$f.<type>" and, unless it already
# is a clause file, converted to text and then to "$data/$f.clauses".
# %original_filename maps the working name (without extension) back to the
# name given on the command line.
if ($opt_d) {print STDERR "Preparing input files in: $data\n";}

for ($f = 0; $f <= $#ARGV; $f++) {
    print STDERR " document ", ($f+1), ": $ARGV[$f]\n";

    $file = $ARGV[$f];
    $temp = "$data/$f";
    $temp =~ s@//@/@g;
    $original_filename{$temp} = $file;

    # The file type is decided by the extension at the END of the name;
    # the patterns are anchored so that ".txt" or ".html" appearing
    # earlier in the path cannot select the wrong branch.
    if ($file =~ /\.clauses$/i) {
        # already a clause file: just copy it into place
        kea_copy_input($file, "$temp.clauses");
    } else {
        if ($file =~ /\.te?xt$/i) {
            kea_copy_input($file, "$temp.txt");
        } elsif ($file =~ /\.html?$/i) {
            kea_copy_input($file, "$temp.html");
            # the converter's standard output is redirected into the text file
            `$perl_command $gsdlhome/perllib/Kea-1.1.4/convert-html-to-text.pl $temp.html > $temp.txt`;
        } elsif ($file =~ /\.cstr$/) {
            kea_copy_input($file, "$temp.cstr");
            `$perl_command $gsdlhome/perllib/Kea-1.1.4/cstr-to-text.pl $temp.cstr $temp.txt`;
        } else {
            die "Unknown file type: $file\n";
        }
        # prepare the file
        `$perl_command $gsdlhome/perllib/Kea-1.1.4/prepare-clauses.pl $temp.txt $temp.clauses`;
    }
}

# Copy one input document into the working directory.
#   $from - name of the document as given on the command line
#   $to   - name of the working copy
# cp is run through the list form of system(), so no shell interprets the
# file names.  A failed copy is reported on STDERR; processing continues.
sub kea_copy_input {
    my ($from, $to) = @_;
    if (system("cp", $from, $to) != 0) {
        print STDERR "Kea: warning: could not copy $from to $to\n";
    }
}
247 | #print STDERR "\n\n";
|
---|
248 |
|
---|
# Build the arff file: k4.pl is given the stopword and document frequency
# files, the optional keyword frequency file and phrase length, then the
# working directory to read and the arff file to write.
$command = "$perl_command $gsdlhome/perllib/Kea-1.1.4/k4.pl -S $stopword_file -f $frequency_file";
$command .= " -K $keyword_frequency_file" if ($keyword_frequency_file);
$command .= " -L $maximum_phrase_length" if ($maximum_phrase_length);
$command .= " $data $arff";

if ($opt_d) {
    print STDERR "** $command **\n";
}
# A non-zero status is reported but, as before, does not stop the run.
if (system($command) != 0) {
    print STDERR "Kea: warning: k4.pl command failed (status $?)\n";
}


# use KEP.java to extract phrases
$command = "$java_command KEP -m $model_file -T $arff";
$command .= " $number_of_phrases" if ($number_of_phrases);
# -R is correctly set by default in the java file when the model is loaded
# $command .= " -R tfidf,first_occurrence,keyword_freq,class" if ($keyword_frequency_file);
# KEP's standard output and standard error are captured in working files.
$command .= " > $out 2> $err";

if ($opt_d) {
    print STDERR "** $command **\n";
}
if (system($command) != 0) {
    print STDERR "Kea: warning: KEP command failed (status $?); see $err\n";
}
272 |
|
---|
273 |
|
---|
274 |
|
---|
# Read output file and create .kea files.
#
# The KEP output is scanned line by line:
#   "Current document: <name>.clauses"  starts the output for a document;
#   "Miss: ..."                         carries one phrase for it.
# Each document's phrases are written, one per line and tab separated, to
# the original file name with its extension replaced by $extension.

open(my $kep_fh, "<", $out)
    or die "Kea: cannot read KEP output file $out: $!\n";

$document = "";
my $doc_fh;    # handle of the output file currently being written

while (defined($line = <$kep_fh>)) {

    chomp $line;

    # new document
    if ($line =~ /^Current document/) {
        # close the old document
        if ($doc_fh) {
            close($doc_fh)
                or print STDERR "Kea: warning: error closing $document: $!\n";
            undef $doc_fh;
        }

        # Work out which input document this is.  Stop if the line cannot
        # be parsed or names a document we never prepared; otherwise the
        # phrases would end up in a file called ".$extension".
        ($doc) = $line =~ /Current document: (.*)\.clauses$/;
        die "Kea: cannot parse KEP output line: $line\n" if (!defined($doc));
        $doc =~ s@//@/@g;
        die "Kea: KEP reported an unknown document: $doc\n"
            if (!defined($original_filename{$doc}));

        # start the new document: swap the last extension for $extension
        $document = $original_filename{$doc};
        $document =~ s/\.[^\.]+$//;
        $document .= ".$extension";
        #print STDERR "Opening DOC: $original_filename{$doc} => $document\n";
        open($doc_fh, ">", $document)
            or die "Kea: cannot write $document: $!\n";

    } elsif ($line =~ /^Miss:/) {
        die "Trying to write with no current document!" if (!$document);

        # First field is the quoted phrase, the last two comma separated
        # fields are the tf.idf value and "no <evidence>".
        ($phrase, $tfidf, $evidence) = $line =~ /Miss: ([^,]+).*,([^,]+),no (.+)$/;
        if (!defined($phrase)) {
            print STDERR "Kea: warning: skipping unparsable KEP line: $line\n";
            next;
        }

        # The phrase has the form 'stemmed (unstemmed)'.
        ($stemmed, $unstemmed) = $phrase =~ /\'(.+) \((.+)\)\'/;
        if (!defined($stemmed)) {
            print STDERR "Kea: warning: skipping unparsable KEP phrase: $phrase\n";
            next;
        }

        if ($output_tfidf) {
            print $doc_fh "$unstemmed\t$stemmed\t$evidence\t$tfidf\n";
        } else {
            print $doc_fh "$unstemmed\t$stemmed\t$evidence\n";
        }
    }
}

# Flush the last output file and release the KEP output.
if ($doc_fh) {
    close($doc_fh)
        or print STDERR "Kea: warning: error closing $document: $!\n";
}
close($kep_fh);
312 |
|
---|
313 |
|
---|
# Remove the working directory and the arff/out/err working files.  In
# debug mode (-d) they are left in place.
unless ($opt_d) {
    my @scratch = ("$data", "$arff", "$out", "$err");
    system("rm", "-r", @scratch);
}