source: trunk/gsdl/packages/kea/kea-3.0/KEAKeyphraseExtractor.java@ 8815

Last change on this file since 8815 was 8815, checked in by mdewsnip, 19 years ago

Kea 3.0, as downloaded from http://www.nzdl.org/kea but with CSTR_abstracts_test, CSTR_abstracts_train, Chinese_test, and Chinese_train directories removed.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.9 KB
Line 
1/*
2 * KEAKeyphraseExtractor.java
3 * Copyright (C) 2001 Eibe Frank
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19import java.io.*;
20import java.util.*;
21import weka.core.*;
22import weka.filters.*;
23
24/**
25 * Extracts keyphrases from the documents in a given directory.
26 * Assumes that the file names for the documents end with ".txt".
27 * Puts extracted keyphrases into corresponding files ending with
28 * ".key" (if those are not already present). Optionally an encoding
29 * for the documents/keyphrases can be defined (e.g. for Chinese
30 * text). Documents for which a ".key" file already exists are used for evaluation.
31 *
32 * Valid options are:<p>
33 *
34 * -l "directory name"<br>
35 * Specifies name of directory.<p>
36 *
37 * -m "model name"<br>
38 * Specifies name of model.<p>
39 *
40 * -e "encoding"<br>
41 * Specifies encoding.<p>
42 *
43 * -n "number of phrases"<br>
44 * Specifies number of phrases to be output (default: 5).<p>
45 *
46 * -d<br>
47 * Turns debugging mode on.<p>
48 *
49 * -a<br>
50 * Also write stemmed phrase and score into ".key" file.<p>
51 *
52 * @author Eibe Frank ([email protected])
53 * @version 1.0
54 */
55public class KEAKeyphraseExtractor implements OptionHandler {
56
57 /** Name of directory */
58 String m_dirName = null;
59
60 /** Name of model */
61 String m_modelName = null;
62
63 /** Encoding */
64 String m_encoding = "default";
65
66 /** Debugging mode? */
67 boolean m_debug = false;
68
69 /** The KEA filter object */
70 KEAFilter m_KEAFilter = null;
71
72 /** The number of phrases to extract. */
73 int m_numPhrases = 5;
74
75 /** Also write stemmed phrase and score into .key file. */
76 boolean m_AdditionalInfo = false;
77
78 /**
79 * Get the value of AdditionalInfo.
80 *
81 * @return Value of AdditionalInfo.
82 */
83 public boolean getAdditionalInfo() {
84
85 return m_AdditionalInfo;
86 }
87
88 /**
89 * Set the value of AdditionalInfo.
90 *
91 * @param newAdditionalInfo Value to assign to AdditionalInfo.
92 */
93 public void setAdditionalInfo(boolean newAdditionalInfo) {
94
95 m_AdditionalInfo = newAdditionalInfo;
96 }
97
98 /**
99 * Get the value of numPhrases.
100 *
101 * @return Value of numPhrases.
102 */
103 public int getNumPhrases() {
104
105 return m_numPhrases;
106 }
107
108 /**
109 * Set the value of numPhrases.
110 *
111 * @param newnumPhrases Value to assign to numPhrases.
112 */
113 public void setNumPhrases(int newnumPhrases) {
114
115 m_numPhrases = newnumPhrases;
116 }
117
118 /**
119 * Get the value of debug.
120 *
121 * @return Value of debug.
122 */
123 public boolean getDebug() {
124
125 return m_debug;
126 }
127
128 /**
129 * Set the value of debug.
130 *
131 * @param newdebug Value to assign to debug.
132 */
133 public void setDebug(boolean newdebug) {
134
135 m_debug = newdebug;
136 }
137
138 /**
139 * Get the value of encoding.
140 *
141 * @return Value of encoding.
142 */
143 public String getEncoding() {
144
145 return m_encoding;
146 }
147
148 /**
149 * Set the value of encoding.
150 *
151 * @param newencoding Value to assign to encoding.
152 */
153 public void setEncoding(String newencoding) {
154
155 m_encoding = newencoding;
156 }
157
158 /**
159 * Get the value of modelName.
160 *
161 * @return Value of modelName.
162 */
163 public String getModelName() {
164
165 return m_modelName;
166 }
167
168 /**
169 * Set the value of modelName.
170 *
171 * @param newmodelName Value to assign to modelName.
172 */
173 public void setModelName(String newmodelName) {
174
175 m_modelName = newmodelName;
176 }
177
178 /**
179 * Get the value of dirName.
180 *
181 * @return Value of dirName.
182 */
183 public String getDirName() {
184
185 return m_dirName;
186 }
187
188 /**
189 * Set the value of dirName.
190 *
191 * @param newdirName Value to assign to dirName.
192 */
193 public void setDirName(String newdirName) {
194
195 m_dirName = newdirName;
196 }
197
198 /**
199 * Parses a given list of options controlling the behaviour of this object.
200 * Valid options are:<p>
201 *
202 * -l "directory name"<br>
203 * Specifies name of directory.<p>
204 *
205 * -m "model name"<br>
206 * Specifies name of model.<p>
207 *
208 * -e "encoding"<br>
209 * Specifies encoding.<p>
210 *
211 * -n<br>
212 * Specifies number of phrases to be output (default: 5).<p>
213 *
214 * -d<br>
215 * Turns debugging mode on.<p>
216 *
217 * -a<br>
218 * Also write stemmed phrase and score into ".key" file.<p>
219 *
220 * @param options the list of options as an array of strings
221 * @exception Exception if an option is not supported
222 */
223 public void setOptions(String[] options) throws Exception {
224
225 String dirName = Utils.getOption('l', options);
226 if (dirName.length() > 0) {
227 setDirName(dirName);
228 } else {
229 setDirName(null);
230 throw new Exception("Name of directory required argument.");
231 }
232 String modelName = Utils.getOption('m', options);
233 if (modelName.length() > 0) {
234 setModelName(modelName);
235 } else {
236 setModelName(null);
237 throw new Exception("Name of model required argument.");
238 }
239 String encoding = Utils.getOption('e', options);
240 if (encoding.length() > 0) {
241 setEncoding(encoding);
242 } else {
243 setEncoding("default");
244 }
245 String numPhrases = Utils.getOption('n', options);
246 if (numPhrases.length() > 0) {
247 setNumPhrases(Integer.parseInt(numPhrases));
248 } else {
249 setNumPhrases(5);
250 }
251 setDebug(Utils.getFlag('d', options));
252 setAdditionalInfo(Utils.getFlag('a', options));
253 Utils.checkForRemainingOptions(options);
254 }
255
256 /**
257 * Gets the current option settings.
258 *
259 * @return an array of strings suitable for passing to setOptions
260 */
261 public String [] getOptions() {
262
263 String [] options = new String [10];
264 int current = 0;
265
266 options[current++] = "-l";
267 options[current++] = "" + (getDirName());
268 options[current++] = "-m";
269 options[current++] = "" + (getModelName());
270 options[current++] = "-e";
271 options[current++] = "" + (getEncoding());
272 options[current++] = "-n";
273 options[current++] = "" + (getNumPhrases());
274 if (getDebug()) {
275 options[current++] = "-d";
276 }
277 if (getAdditionalInfo()) {
278 options[current++] = "-a";
279 }
280
281 while (current < options.length) {
282 options[current++] = "";
283 }
284 return options;
285 }
286
287 /**
288 * Returns an enumeration describing the available options.
289 *
290 * @return an enumeration of all the available options
291 */
292 public Enumeration listOptions() {
293
294 Vector newVector = new Vector(6);
295
296 newVector.addElement(new Option(
297 "\tSpecifies name of directory.",
298 "l", 1, "-l <directory name>"));
299 newVector.addElement(new Option(
300 "\tSpecifies name of model.",
301 "m", 1, "-m <model name>"));
302 newVector.addElement(new Option(
303 "\tSpecifies encoding.",
304 "e", 1, "-e <encoding>"));
305 newVector.addElement(new Option(
306 "\tSpecifies number of phrases to be output (default: 5).",
307 "n", 1, "-n"));
308 newVector.addElement(new Option(
309 "\tTurns debugging mode on.",
310 "d", 0, "-d"));
311 newVector.addElement(new Option(
312 "\tAlso write stemmed phrase and score into \".key\" file.",
313 "a", 0, "-a"));
314
315 return newVector.elements();
316 }
317
318 /**
319 * Collects the stems of the file names.
320 */
321 public Hashtable collectStems() throws Exception {
322
323 Hashtable stems = new Hashtable();
324
325 try {
326 File dir = new File(m_dirName);
327 String[] files = dir.list();
328 for (int i = 0; i < files.length; i++) {
329 if (files[i].endsWith(".txt")) {
330 String stem = files[i].substring(0, files[i].length() - 4);
331 if (!stems.containsKey(stem)) {
332 stems.put(stem, new Double(0));
333 }
334 }
335 }
336 } catch (Exception e) {
337 throw new Exception("Problem opening directory " + m_dirName);
338 }
339 return stems;
340 }
341
342 /**
343 * Builds the model from the files
344 */
345 public void extractKeyphrases(Hashtable stems) throws Exception {
346
347 Vector stats = new Vector();
348
349 // Check whether there is actually any data
350 if (stems.size() == 0) {
351 throw new Exception("Couldn't find any data!");
352 }
353
354 FastVector atts = new FastVector(2);
355 atts.addElement(new Attribute("doc", null));
356 atts.addElement(new Attribute("keyphrases", null));
357 Instances data = new Instances("keyphrase_training_data", atts, 0);
358
359 // Extract keyphrases
360 Enumeration elem = stems.keys();
361 while (elem.hasMoreElements()) {
362 String str = (String)elem.nextElement();
363 double[] newInst = new double[2];
364 try {
365 File txt = new File(m_dirName + "/" + str + ".txt");
366 InputStreamReader is;
367 if (!m_encoding.equals("default")) {
368 is = new InputStreamReader(new FileInputStream(txt), m_encoding);
369 } else {
370 is = new InputStreamReader(new FileInputStream(txt));
371 }
372 StringBuffer txtStr = new StringBuffer();
373 int c;
374 while ((c = is.read()) != -1) {
375 txtStr.append((char)c);
376 }
377 newInst[0] = (double)data.attribute(0).addStringValue(txtStr.toString());
378 } catch (Exception e) {
379 if (m_debug) {
380 System.err.println("Can't read document " + str + ".txt");
381 }
382 newInst[0] = Instance.missingValue();
383 }
384 try {
385 File key = new File(m_dirName + "/" + str + ".key");
386 InputStreamReader is;
387 if (!m_encoding.equals("default")) {
388 is = new InputStreamReader(new FileInputStream(key), m_encoding);
389 } else {
390 is = new InputStreamReader(new FileInputStream(key));
391 }
392 StringBuffer keyStr = new StringBuffer();
393 int c;
394 while ((c = is.read()) != -1) {
395 keyStr.append((char)c);
396 }
397 newInst[1] = (double)data.attribute(1).addStringValue(keyStr.toString());
398 } catch (Exception e) {
399 if (m_debug) {
400 System.err.println("No keyphrases for stem " + str + ".");
401 }
402 newInst[1] = Instance.missingValue();
403 }
404 data.add(new Instance(1.0, newInst));
405 m_KEAFilter.input(data.instance(0));
406 data = data.stringFreeStructure();
407 if (m_debug) {
408 System.err.println("-- Document: " + str);
409 }
410 Instance[] topRankedInstances = new Instance[m_numPhrases];
411 Instance inst;
412 while ((inst = m_KEAFilter.output()) != null) {
413 int index = (int)inst.value(m_KEAFilter.getRankIndex()) - 1;
414 if (index < m_numPhrases) {
415 topRankedInstances[index] = inst;
416 }
417 }
418 if (m_debug) {
419 System.err.println("-- Keyphrases and feature values:");
420 }
421 FileOutputStream out = null;
422 PrintWriter printer = null;
423 File key = new File(m_dirName + "/" + str + ".key");
424 if (!key.exists()) {
425 out = new FileOutputStream(m_dirName + "/" + str + ".key");
426 if (!m_encoding.equals("default")) {
427 printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
428 } else {
429 printer = new PrintWriter(out);
430 }
431 }
432 double numExtracted = 0, numCorrect = 0;
433 for (int i = 0; i < m_numPhrases; i++) {
434 if (topRankedInstances[i] != null) {
435 if (!topRankedInstances[i].
436 isMissing(topRankedInstances[i].numAttributes() - 1)) {
437 numExtracted += 1.0;
438 }
439 if ((int)topRankedInstances[i].
440 value(topRankedInstances[i].numAttributes() - 1) ==
441 topRankedInstances[i].
442 attribute(topRankedInstances[i].numAttributes() - 1).
443 indexOfValue("True")) {
444 numCorrect += 1.0;
445 }
446 if (printer != null) {
447 printer.print(topRankedInstances[i].
448 stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
449 if (m_AdditionalInfo) {
450 printer.print("\t");
451 printer.print(topRankedInstances[i].
452 stringValue(m_KEAFilter.getStemmedPhraseIndex()));
453 printer.print("\t");
454 printer.print(Utils.
455 doubleToString(topRankedInstances[i].
456 value(m_KEAFilter.
457 getProbabilityIndex()), 4));
458 }
459 printer.println();
460 }
461 if (m_debug) {
462 System.err.println(topRankedInstances[i]);
463 }
464 }
465 }
466 if (numExtracted > 0) {
467 if (m_debug) {
468 System.err.println("-- " + numCorrect + " correct");
469 }
470 stats.addElement(new Double(numCorrect));
471 }
472 if (printer != null) {
473 printer.flush();
474 printer.close();
475 out.close();
476 }
477 }
478 double[] st = new double[stats.size()];
479 for (int i = 0; i < stats.size(); i++) {
480 st[i] = ((Double)stats.elementAt(i)).doubleValue();
481 }
482 double avg = Utils.mean(st);
483 double stdDev = Math.sqrt(Utils.variance(st));
484 System.err.println("Avg. number of correct keyphrases: " +
485 Utils.doubleToString(avg, 2) + " +/- " +
486 Utils.doubleToString(stdDev, 2));
487 System.err.println("Based on " + stats.size() + " documents");
488 m_KEAFilter.batchFinished();
489 }
490
491 /**
492 * Loads the extraction model from the file.
493 */
494 public void loadModel() throws Exception {
495
496 BufferedInputStream inStream =
497 new BufferedInputStream(new FileInputStream(m_modelName));
498 ObjectInputStream in = new ObjectInputStream(inStream);
499 m_KEAFilter = (KEAFilter)in.readObject();
500 in.close();
501 }
502
503 /**
504 * The main method.
505 */
506 public static void main(String[] ops) {
507
508 KEAKeyphraseExtractor kmb = new KEAKeyphraseExtractor();
509 try {
510 kmb.setOptions(ops);
511 System.err.print("Extracting keyphrases with options: ");
512 String[] optionSettings = kmb.getOptions();
513 for (int i = 0; i < optionSettings.length; i++) {
514 System.err.print(optionSettings[i] + " ");
515 }
516 System.err.println();
517 kmb.loadModel();
518 kmb.extractKeyphrases(kmb.collectStems());
519 } catch (Exception e) {
520 e.printStackTrace();
521 System.err.println(e.getMessage());
522 System.err.println("\nOptions:\n");
523 Enumeration enum = kmb.listOptions();
524 while (enum.hasMoreElements()) {
525 Option option = (Option) enum.nextElement();
526 System.err.println(option.synopsis());
527 System.err.println(option.description());
528 }
529 }
530 }
531}
532
Note: See TracBrowser for help on using the repository browser.