1 | #!/usr/bin/perl -w
|
---|
2 |
|
---|
3 | # Kea
|
---|
4 | # Version 1.1.4
|
---|
5 |
|
---|
6 | # Kea -- Automatic Keyphrase Extraction
|
---|
7 | # Copyright 1998-1999 by Gordon Paynter and Eibe Frank
|
---|
8 | # Contact [email protected] or [email protected]
|
---|
9 | #
|
---|
10 | # This program is free software; you can redistribute it and/or modify
|
---|
11 | # it under the terms of the GNU General Public License as published by
|
---|
12 | # the Free Software Foundation; either version 2 of the License, or
|
---|
13 | # (at your option) any later version.
|
---|
14 | #
|
---|
15 | # This program is distributed in the hope that it will be useful,
|
---|
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
18 | # GNU General Public License for more details.
|
---|
19 | #
|
---|
20 | # You should have received a copy of the GNU General Public License
|
---|
21 | # along with this program; if not, write to the Free Software
|
---|
22 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
23 |
|
---|
24 | # Version history
|
---|
25 | #
|
---|
26 | # 1.0 Witten et al.
|
---|
27 | # 1.0.1 Bug: stopword file loaded as model file
|
---|
28 | # 1.0.2 Java paths explicit for nikau; JIT compiler
|
---|
29 | # 1.0.3 Include tf.idf in output if -t is set
|
---|
30 | # 1.0.4 Allow optional keyphrase frequency file
|
---|
31 | # 1.0.5 Use $perl_command and $java_command for system-independence
|
---|
32 | # 1.0.6 -C argument selects model, stopword file, df file
|
---|
33 | # 1.0.7 Changes to Kea.pl.
|
---|
34 | # This is Phillip's version
|
---|
35 | # 1.0.8 Accepts .htm as well as .html
|
---|
36 | # 1.0.9 Accepts .text as well as .txt
|
---|
37 | # 1.1 First Distribution. GPL added. Documentation added.
|
---|
38 | # 1.1.1 -E argument sets output extension (default is .kea)
|
---|
39 | # 1.1.2 Documented java variables
|
---|
40 | # Maximum phrase length can be set at command-line.
|
---|
41 | # Note: default=3; NOT the length for the model. Sorry.
|
---|
42 | # 1.1.3 Moved Lynx command into separate script that checks
|
---|
43 | # for circumstances that are likely to crash it.
|
---|
44 | # 1.1.4 Updated documentation and added a few extra files.
|
---|
45 |
|
---|
# Announce ourselves on STDERR; nothing in this script writes to STDOUT.
print STDERR "\nKea (version 1.1.4): automatic keyphrase extraction\n";

# Every path used by this script is built from the Greenstone home
# directory, so refuse to continue when GSDLHOME is missing instead of
# silently working with paths rooted at "/".
$gsdlhome = $ENV{'GSDLHOME'};
die "Kea: the GSDLHOME environment variable is not set\n"
    unless (defined($gsdlhome) && length($gsdlhome));

# (Re)build the iterated Lovins stemmer binary on every run.
# The list form of system() bypasses the shell, so a GSDLHOME containing
# spaces or shell metacharacters reaches gcc intact.  A failed build is
# reported but is not fatal: a stemmer binary from an earlier run may
# still be in place.
{
    my $kea_dir     = "$gsdlhome/perllib/Kea-1.1.4";
    my $stemmer_dir = "$kea_dir/Iterated-Lovins-stemmer";
    my $status      = system('gcc', '-o', "$kea_dir/stemmer",
                             "$stemmer_dir/stem-Lovins-iterated.c",
                             "$stemmer_dir/stem.c");
    warn "Kea: could not compile the stemmer with gcc (wait status $status)\n"
        if ($status != 0);
}
51 |
|
---|
52 |
|
---|
# Helper perl scripts are launched through the shell with this
# interpreter command line.
$perl_command = 'perl -w';

# Java needs a little more configuration.
# Point this at your java home directory (site-specific).
# Alternative location: "/usr/local/jdk"
$java_home = '/usr/local/share/java';

# CLASSPATH handed to java on the command line so that jaws.jar and the
# Kea directory are found; this should not normally need changing.
$java_classpath = join(':',
    '.',
    "$gsdlhome/perllib/Kea-1.1.4/jaws.jar",
    "$gsdlhome/perllib/Kea-1.1.4",
    "$java_home/lib/classes.zip");

# Name of your java just-in-time compiler (for example "tya").
# Leave it empty if you don't have a JIT compiler.
$java_JIT_compiler = '';

# Extra java arguments, e.g. to give java lots of memory.
# Use an empty string if none are wanted.
$java_extra_args = '-ss100000000 -oss100000000 -mx200000000';

# The actual java command is assembled from the settings above; the
# optional parts are only included when they are non-empty.
{
    my @java_words = ("$java_home/bin/java", '-classpath', "\"$java_classpath\"");
    push(@java_words, "-Djava.compiler=$java_JIT_compiler") if ($java_JIT_compiler);
    push(@java_words, $java_extra_args) if ($java_extra_args);
    $java_command = join(' ', @java_words);
}
79 |
|
---|
80 |
|
---|
# Parse command line options.
#
# Getopt::Std is the maintained, core replacement for the perl4-era
# getopts.pl library, which is no longer shipped with modern perl.  It
# accepts the same option specification and, when called without a hash
# reference, sets $opt_d, $opt_t, $opt_N, ... as package variables, which
# is what the rest of this script reads.  Recognised options are removed
# from @ARGV, leaving only the file names.
use Getopt::Std;
getopts("dtN:E:C:F:K:L:M:S:");

# What files shall we use?
# Test the NUMBER of remaining arguments rather than the truth of the
# first one, so that a file literally named "0" is not mistaken for
# "no files given".
if (!@ARGV) {
    die "Usage: Kea [options] text-or-html-or-cstr-files
Options:
  -d            Debug mode
  -t            Output TF.IDF
  -N n          Output n keyphrases
  -L n          Maximum phrases length is n (default = 3)
  -E <suffix>   Output extension is <suffix>
  -C <corpus>   Use model/df/kf/stopwords based on <corpus>
  -F <document-frequency file>
  -K <keyphrase-frequency file>
  -M <Naive-Bayes model file>
  -S <stopword file>
See README for more detail.
";
}
102 |
|
---|
103 |
|
---|
# -N  Number of phrases to extract.
# $number_of_phrases ends up holding either the ready-made "-N <n>"
# argument for the extraction command or an empty string, in which case
# the number of phrases is left to the model.
$number_of_phrases = "";
if (!$opt_N) {
    print STDERR "Number of phrases dictated by model (default)\n";
} elsif (($opt_N !~ /^\d+$/) || ($opt_N <= 0)) {
    # something was given, but it is not a positive whole number
    die "Kea cannot understand -N argument (must be a number): $opt_N\n";
} else {
    $number_of_phrases = "-N $opt_N";
    print STDERR "Number of phrases to extract: $opt_N\n";
}
114 |
|
---|
# -L  Maximum phrase length.
# 0 means "not given on the command line"; any value that is supplied must
# be a positive whole number.
$maximum_phrase_length = 0;
if ($opt_L) {
    die "Kea cannot understand -L argument (must be a number): $opt_L\n"
        unless (($opt_L =~ /^\d+$/) && ($opt_L > 0));
    $maximum_phrase_length = "$opt_L";
    print STDERR "Maximum phrase length: $opt_L\n";
}
123 |
|
---|
# -E  Extension of the output files ("kea" unless overridden).
# A leading dot on the supplied suffix is dropped, because the dot is
# added again when the output file name is built.
if ($opt_E) {
    ($extension = $opt_E) =~ s/^\.//g;
    print STDERR "Using output extension: $extension\n";
} else {
    $extension = "kea";
}
131 |
|
---|
# -t  Should tf.idf be included in the output?  (This option is used by
# Kniles.)
# $opt_t is deliberately mentioned twice: it appears nowhere else in the
# script, and a variable named only once presumably draws a "used only
# once" warning under -w.
$output_tfidf = ($opt_t && $opt_t) ? 1 : 0;
print STDERR "Do print tf.idf\n" if ($output_tfidf);
139 |
|
---|
140 |
|
---|
# -C  Corpus file stem.
# The model, stopword, document-frequency and (optional) keyphrase-frequency
# files all share one stem inside the Kea directory: "aliweb" unless a
# corpus name is given with -C.
$default_stem = "$gsdlhome/perllib/Kea-1.1.4/" . ($opt_C ? $opt_C : "aliweb");
print STDERR "Corpus: $opt_C (setting default model/stopwords/df)\n" if ($opt_C);

# Default files derived from the stem; each may still be overridden by its
# own command line option.
($model_file, $stopword_file, $frequency_file) =
    map { "$default_stem.$_" } ("model", "stopwords", "df");

# The keyphrase-frequency file is optional and is only used by default when
# it actually exists.
$keyword_frequency_file = (-e "$default_stem.kf") ? "$default_stem.kf" : "";
154 |
|
---|
155 |
|
---|
# Per-file overrides.  Each option names a file inside the Kea directory
# that replaces the corpus default:
#   -F  document frequency file      -M  model file
#   -S  stopword file                -K  keyword frequency file
$frequency_file         = "$gsdlhome/perllib/Kea-1.1.4/$opt_F" if ($opt_F);
$model_file             = "$gsdlhome/perllib/Kea-1.1.4/$opt_M" if ($opt_M);
$stopword_file          = "$gsdlhome/perllib/Kea-1.1.4/$opt_S" if ($opt_S);
$keyword_frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_K" if ($opt_K);

# The first three files are mandatory: report each one in turn and stop at
# the first that is missing.
foreach my $required (["Document frequency file", $frequency_file],
                      ["Model file",              $model_file],
                      ["Stopword file",           $stopword_file]) {
    my ($label, $path) = @$required;
    print STDERR "$label: $path\n";
    die "$label does not exist!\n" if (!(-e $path));
}

# The keyword frequency file is optional, but when one has been selected
# it has to exist.
if (!$keyword_frequency_file) {
    print STDERR "No keyword frequency file (default)\n";
} else {
    print STDERR "Keyword frequency file: $keyword_frequency_file\n";
    die "Keyword frequency file does not exist!\n" if (!(-e $keyword_frequency_file));
}
179 |
|
---|
# Report how many input files are left on the command line once the
# options have been removed.
$number_of_files = scalar(@ARGV);
print STDERR "Number of files: $number_of_files\n\n";
183 |
|
---|
184 |
|
---|
185 |
|
---|
# Set up working files.
#
# All intermediate files of this run live under $gsdlhome/tmp and share a
# stem that contains the process id ($$).
$stem = "$gsdlhome/tmp/kea.$$";

$data = "$stem.data";    # directory holding the working copy of each input
$arff = "$stem.arff";    # arff file built from the working copies
$out  = "$stem.out";     # standard output of the extraction step
$err  = "$stem.err";     # standard error of the extraction step

# Create the data directory with perl's own mkdir so that a failure is
# noticed here, with the reason, instead of surfacing later as a series of
# confusing copy and conversion errors.  The explicit chmod reproduces the
# exact 777 mode previously requested with "mkdir -m 777" (mkdir's own mode
# argument is subject to the umask).  An already existing directory is
# tolerated, as it was before.
# NOTE(review): a world-writable directory with a predictable name is
# questionable -- confirm whether the helper scripts really need mode 777.
unless (-d $data) {
    mkdir($data, 0777)
        or die "Kea: cannot create working directory $data: $!\n";
    chmod(0777, $data);
}
196 |
|
---|
197 |
|
---|
# Process each input file into a working file.
#
# Every document named on the command line is copied into the data
# directory as <index>.<type> and coerced, step by step, into
# <index>.clauses.  %original_filename maps the working stem ("$data/<index>")
# back to the file name the user gave.
#
# The file type is decided by the END of the file name, so that a name such
# as "notes.txt.html" is handled as HTML and "x.clauses.bak" is rejected
# rather than copied as a clause file.  Inputs are copied with File::Copy
# instead of a shell "cp", so names containing spaces or shell
# metacharacters are handled correctly.
use File::Copy;

# Copy one input document into the data directory.
# Returns true on success; on failure a warning is printed and false is
# returned so that the caller can skip the document.
sub _kea_copy_input {
    my ($from, $to) = @_;
    return 1 if (copy($from, $to));
    warn "Kea: cannot copy $from to $to: $!\n";
    return 0;
}

# Run one helper command through the shell, discarding its standard output
# as before, and report (but tolerate) a non-zero exit status.
sub _kea_run_helper {
    my ($helper_command) = @_;
    `$helper_command`;
    warn "Kea: command failed (wait status $?): $helper_command\n" if ($? != 0);
}

print STDERR "Preparing input files in: $data\n";

foreach my $f (0 .. $#ARGV) {
    my $file = $ARGV[$f];
    my $temp = "$data/$f";

    # progress line; "\r" keeps it on a single terminal line
    print STDERR " document ", ($f + 1), ": $file\r";
    $original_filename{$temp} = $file;

    # clause files are used as they are
    if ($file =~ /\.clauses$/) {
        _kea_copy_input($file, "$temp.clauses");
        next;
    }

    # everything else is first coerced into a text file ...
    if ($file =~ /\.te?xt$/) {
        next unless (_kea_copy_input($file, "$temp.txt"));
    } elsif ($file =~ /\.html?$/i) {
        next unless (_kea_copy_input($file, "$temp.html"));
        _kea_run_helper("$perl_command $gsdlhome/perllib/Kea-1.1.4/convert-html-to-text.pl $temp.html > $temp.txt");
    } elsif ($file =~ /\.cstr$/) {
        next unless (_kea_copy_input($file, "$temp.cstr"));
        _kea_run_helper("$perl_command $gsdlhome/perllib/Kea-1.1.4/cstr-to-text.pl $temp.cstr $temp.txt");
    } else {
        die "Unknown file type: $file\n";
    }

    # ... and the text file is then prepared as a clause file
    _kea_run_helper("$perl_command $gsdlhome/perllib/Kea-1.1.4/prepare-clauses.pl $temp.txt $temp.clauses");
}
#print STDERR "\n\n";
228 |
|
---|
# Build the arff file by running k4.pl over the data directory.
# The command is assembled word by word; -K and -L are only passed on when
# a keyword frequency file or a maximum phrase length has been set.
{
    my @k4_words = ($perl_command, "$gsdlhome/perllib/Kea-1.1.4/k4.pl",
                    "-S", $stopword_file, "-f", $frequency_file);
    push(@k4_words, "-K", $keyword_frequency_file) if ($keyword_frequency_file);
    push(@k4_words, "-L", $maximum_phrase_length) if ($maximum_phrase_length);
    push(@k4_words, $data, $arff);
    $command = join(" ", @k4_words);
}

#print STDERR "** $command **\n";
`$command`;
237 |
|
---|
238 |
|
---|
# Use KEP.java to extract phrases from the arff file.  Its standard output
# is redirected to $out and its standard error to $err.
# -R is correctly set by default in the java file when the model is loaded,
# so it is not passed here:
# $command .= " -R tfidf,first_occurrence,keyword_freq,class" if ($keyword_frequency_file);
$command = join(" ",
    $java_command, "KEP", "-m", $model_file, "-T", $arff,
    ($number_of_phrases ? ($number_of_phrases) : ()),
    ">", $out, "2>", $err);

# show the exact command before running it
print STDERR "** $command **\n";
`$command`;
248 |
|
---|
249 |
|
---|
250 |
|
---|
# Read the extraction output file and create the .kea files.
#
# Two kinds of line in $out are acted upon; everything else is ignored:
#   "Current document: <stem>.clauses"  starts the output file for the input
#                                       document recorded under <stem> in
#                                       %original_filename
#   "Miss: ..."                         describes one phrase, written to the
#                                       current output file as tab-separated
#                                       fields
# Both files are opened with three-argument open and checked, so that a
# missing result file or an unwritable output location is reported here
# instead of silently producing nothing.

open(my $kep_fh, '<', $out)
    or die "Kea: cannot read extraction output $out: $!\n";

my $doc_fh;        # handle of the output file currently being written
$document = "";    # name of that output file ("" before the first document)

while (my $line = <$kep_fh>) {

    chomp($line);

    # new document
    if ($line =~ /^Current document/) {
        # close the old document
        if ($doc_fh) {
            close($doc_fh) or warn "Kea: error closing $document: $!\n";
            undef $doc_fh;
        }

        # work out which input file this section belongs to; without this
        # check an unrecognised header would create a file named
        # ".$extension" in the current directory
        my ($doc) = $line =~ /Current document: (.*)\.clauses$/;
        die "Kea: cannot identify the document named in: $line\n"
            unless (defined($doc) && defined($original_filename{$doc}));

        # the output file is the original name with its last extension
        # replaced by the output extension
        $document = $original_filename{$doc};
        $document =~ s/\.[^\.]+$//;
        $document .= ".$extension";
        #print STDERR "Opening DOC: $original_filename{$doc} => $document\n";
        open($doc_fh, '>', $document)
            or die "Kea: cannot write $document: $!\n";

    } elsif ($line =~ /^Miss:/) {
        die "Trying to write with no current document!" if (!$document);

        # pick the phrase, its tf.idf value and the evidence field out of
        # the line, then split the quoted phrase into its two forms:
        # 'stemmed (unstemmed)'
        my ($phrase, $tfidf, $evidence) = $line =~ /Miss: ([^,]+).*,([^,]+),no (.+)$/;
        my ($stemmed, $unstemmed) =
            defined($phrase) ? ($phrase =~ /\'(.+) \((.+)\)\'/) : ();

        # a line that does not have the expected shape would only produce
        # a row of empty fields, so report it and move on
        unless (defined($stemmed) && defined($unstemmed)) {
            warn "Kea: skipping unparsable line in $out: $line\n";
            next;
        }

        if ($output_tfidf) {
            print {$doc_fh} "$unstemmed\t$stemmed\t$evidence\t$tfidf\n";
        } else {
            print {$doc_fh} "$unstemmed\t$stemmed\t$evidence\n";
        }
    }
}

# close the last document explicitly so that buffered write errors surface
if ($doc_fh) {
    close($doc_fh) or warn "Kea: error closing $document: $!\n";
}
close($kep_fh);
287 |
|
---|
288 |
|
---|
289 | #get rid of temporary files
|
---|
290 | #if (!$opt_d && !$opt_d) {
|
---|
291 | # `rm -r $data $arff $out $err`;
|
---|
292 | #}
|
---|
293 |
|
---|
294 |
|
---|
295 |
|
---|
296 |
|
---|
297 |
|
---|
298 |
|
---|
299 |
|
---|