/*
 *    KEAKeyphraseExtractor.java
 *    Copyright (C) 2001 Eibe Frank
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
import java.io.*;
import java.util.*;
import weka.core.*;
import weka.filters.*;

24 | /**
|
---|
25 | * Extracts keyphrases from the documents in a given directory.
|
---|
26 | * Assumes that the file names for the documents end with ".txt".
|
---|
27 | * Puts extracted keyphrases into corresponding files ending with
|
---|
28 | * ".key" (if those are not already present). Optionally an encoding
|
---|
29 | * for the documents/keyphrases can be defined (e.g. for Chinese
|
---|
30 | * text). Documents for which ".key" exists, are used for evaluation.
|
---|
31 | *
|
---|
32 | * Valid options are:<p>
|
---|
33 | *
|
---|
34 | * -l "directory name"<br>
|
---|
35 | * Specifies name of directory.<p>
|
---|
36 | *
|
---|
37 | * -m "model name"<br>
|
---|
38 | * Specifies name of model.<p>
|
---|
39 | *
|
---|
40 | * -e "encoding"<br>
|
---|
41 | * Specifies encoding.<p>
|
---|
42 | *
|
---|
43 | * -n <br>
|
---|
44 | * Specifies number of phrases to be output (default: 5).<p>
|
---|
45 | *
|
---|
46 | * -d<br>
|
---|
47 | * Turns debugging mode on.<p>
|
---|
48 | *
|
---|
49 | * -a<br>
|
---|
50 | * Also write stemmed phrase and score into ".key" file.<p>
|
---|
51 | *
|
---|
52 | * @author Eibe Frank ([email protected])
|
---|
53 | * @version 1.0
|
---|
54 | */
|
---|
55 | public class KEAKeyphraseExtractor implements OptionHandler {
|
---|
56 |
|
---|
57 | /** Name of directory */
|
---|
58 | String m_dirName = null;
|
---|
59 |
|
---|
60 | /** Name of model */
|
---|
61 | String m_modelName = null;
|
---|
62 |
|
---|
63 | /** Encoding */
|
---|
64 | String m_encoding = "default";
|
---|
65 |
|
---|
66 | /** Debugging mode? */
|
---|
67 | boolean m_debug = false;
|
---|
68 |
|
---|
69 | /** The KEA filter object */
|
---|
70 | KEAFilter m_KEAFilter = null;
|
---|
71 |
|
---|
72 | /** The number of phrases to extract. */
|
---|
73 | int m_numPhrases = 5;
|
---|
74 |
|
---|
75 | /** Also write stemmed phrase and score into .key file. */
|
---|
76 | boolean m_AdditionalInfo = false;
|
---|
77 |
|
---|
78 | /**
|
---|
79 | * Get the value of AdditionalInfo.
|
---|
80 | *
|
---|
81 | * @return Value of AdditionalInfo.
|
---|
82 | */
|
---|
83 | public boolean getAdditionalInfo() {
|
---|
84 |
|
---|
85 | return m_AdditionalInfo;
|
---|
86 | }
|
---|
87 |
|
---|
88 | /**
|
---|
89 | * Set the value of AdditionalInfo.
|
---|
90 | *
|
---|
91 | * @param newAdditionalInfo Value to assign to AdditionalInfo.
|
---|
92 | */
|
---|
93 | public void setAdditionalInfo(boolean newAdditionalInfo) {
|
---|
94 |
|
---|
95 | m_AdditionalInfo = newAdditionalInfo;
|
---|
96 | }
|
---|
97 |
|
---|
98 | /**
|
---|
99 | * Get the value of numPhrases.
|
---|
100 | *
|
---|
101 | * @return Value of numPhrases.
|
---|
102 | */
|
---|
103 | public int getNumPhrases() {
|
---|
104 |
|
---|
105 | return m_numPhrases;
|
---|
106 | }
|
---|
107 |
|
---|
108 | /**
|
---|
109 | * Set the value of numPhrases.
|
---|
110 | *
|
---|
111 | * @param newnumPhrases Value to assign to numPhrases.
|
---|
112 | */
|
---|
113 | public void setNumPhrases(int newnumPhrases) {
|
---|
114 |
|
---|
115 | m_numPhrases = newnumPhrases;
|
---|
116 | }
|
---|
117 |
|
---|
118 | /**
|
---|
119 | * Get the value of debug.
|
---|
120 | *
|
---|
121 | * @return Value of debug.
|
---|
122 | */
|
---|
123 | public boolean getDebug() {
|
---|
124 |
|
---|
125 | return m_debug;
|
---|
126 | }
|
---|
127 |
|
---|
128 | /**
|
---|
129 | * Set the value of debug.
|
---|
130 | *
|
---|
131 | * @param newdebug Value to assign to debug.
|
---|
132 | */
|
---|
133 | public void setDebug(boolean newdebug) {
|
---|
134 |
|
---|
135 | m_debug = newdebug;
|
---|
136 | }
|
---|
137 |
|
---|
138 | /**
|
---|
139 | * Get the value of encoding.
|
---|
140 | *
|
---|
141 | * @return Value of encoding.
|
---|
142 | */
|
---|
143 | public String getEncoding() {
|
---|
144 |
|
---|
145 | return m_encoding;
|
---|
146 | }
|
---|
147 |
|
---|
148 | /**
|
---|
149 | * Set the value of encoding.
|
---|
150 | *
|
---|
151 | * @param newencoding Value to assign to encoding.
|
---|
152 | */
|
---|
153 | public void setEncoding(String newencoding) {
|
---|
154 |
|
---|
155 | m_encoding = newencoding;
|
---|
156 | }
|
---|
157 |
|
---|
158 | /**
|
---|
159 | * Get the value of modelName.
|
---|
160 | *
|
---|
161 | * @return Value of modelName.
|
---|
162 | */
|
---|
163 | public String getModelName() {
|
---|
164 |
|
---|
165 | return m_modelName;
|
---|
166 | }
|
---|
167 |
|
---|
168 | /**
|
---|
169 | * Set the value of modelName.
|
---|
170 | *
|
---|
171 | * @param newmodelName Value to assign to modelName.
|
---|
172 | */
|
---|
173 | public void setModelName(String newmodelName) {
|
---|
174 |
|
---|
175 | m_modelName = newmodelName;
|
---|
176 | }
|
---|
177 |
|
---|
178 | /**
|
---|
179 | * Get the value of dirName.
|
---|
180 | *
|
---|
181 | * @return Value of dirName.
|
---|
182 | */
|
---|
183 | public String getDirName() {
|
---|
184 |
|
---|
185 | return m_dirName;
|
---|
186 | }
|
---|
187 |
|
---|
188 | /**
|
---|
189 | * Set the value of dirName.
|
---|
190 | *
|
---|
191 | * @param newdirName Value to assign to dirName.
|
---|
192 | */
|
---|
193 | public void setDirName(String newdirName) {
|
---|
194 |
|
---|
195 | m_dirName = newdirName;
|
---|
196 | }
|
---|
197 |
|
---|
198 | /**
|
---|
199 | * Parses a given list of options controlling the behaviour of this object.
|
---|
200 | * Valid options are:<p>
|
---|
201 | *
|
---|
202 | * -l "directory name"<br>
|
---|
203 | * Specifies name of directory.<p>
|
---|
204 | *
|
---|
205 | * -m "model name"<br>
|
---|
206 | * Specifies name of model.<p>
|
---|
207 | *
|
---|
208 | * -e "encoding"<br>
|
---|
209 | * Specifies encoding.<p>
|
---|
210 | *
|
---|
211 | * -n<br>
|
---|
212 | * Specifies number of phrases to be output (default: 5).<p>
|
---|
213 | *
|
---|
214 | * -d<br>
|
---|
215 | * Turns debugging mode on.<p>
|
---|
216 | *
|
---|
217 | * -a<br>
|
---|
218 | * Also write stemmed phrase and score into ".key" file.<p>
|
---|
219 | *
|
---|
220 | * @param options the list of options as an array of strings
|
---|
221 | * @exception Exception if an option is not supported
|
---|
222 | */
|
---|
223 | public void setOptions(String[] options) throws Exception {
|
---|
224 |
|
---|
225 | String dirName = Utils.getOption('l', options);
|
---|
226 | if (dirName.length() > 0) {
|
---|
227 | setDirName(dirName);
|
---|
228 | } else {
|
---|
229 | setDirName(null);
|
---|
230 | throw new Exception("Name of directory required argument.");
|
---|
231 | }
|
---|
232 | String modelName = Utils.getOption('m', options);
|
---|
233 | if (modelName.length() > 0) {
|
---|
234 | setModelName(modelName);
|
---|
235 | } else {
|
---|
236 | setModelName(null);
|
---|
237 | throw new Exception("Name of model required argument.");
|
---|
238 | }
|
---|
239 | String encoding = Utils.getOption('e', options);
|
---|
240 | if (encoding.length() > 0) {
|
---|
241 | setEncoding(encoding);
|
---|
242 | } else {
|
---|
243 | setEncoding("default");
|
---|
244 | }
|
---|
245 | String numPhrases = Utils.getOption('n', options);
|
---|
246 | if (numPhrases.length() > 0) {
|
---|
247 | setNumPhrases(Integer.parseInt(numPhrases));
|
---|
248 | } else {
|
---|
249 | setNumPhrases(5);
|
---|
250 | }
|
---|
251 | setDebug(Utils.getFlag('d', options));
|
---|
252 | setAdditionalInfo(Utils.getFlag('a', options));
|
---|
253 | Utils.checkForRemainingOptions(options);
|
---|
254 | }
|
---|
255 |
|
---|
256 | /**
|
---|
257 | * Gets the current option settings.
|
---|
258 | *
|
---|
259 | * @return an array of strings suitable for passing to setOptions
|
---|
260 | */
|
---|
261 | public String [] getOptions() {
|
---|
262 |
|
---|
263 | String [] options = new String [10];
|
---|
264 | int current = 0;
|
---|
265 |
|
---|
266 | options[current++] = "-l";
|
---|
267 | options[current++] = "" + (getDirName());
|
---|
268 | options[current++] = "-m";
|
---|
269 | options[current++] = "" + (getModelName());
|
---|
270 | options[current++] = "-e";
|
---|
271 | options[current++] = "" + (getEncoding());
|
---|
272 | options[current++] = "-n";
|
---|
273 | options[current++] = "" + (getNumPhrases());
|
---|
274 | if (getDebug()) {
|
---|
275 | options[current++] = "-d";
|
---|
276 | }
|
---|
277 | if (getAdditionalInfo()) {
|
---|
278 | options[current++] = "-a";
|
---|
279 | }
|
---|
280 |
|
---|
281 | while (current < options.length) {
|
---|
282 | options[current++] = "";
|
---|
283 | }
|
---|
284 | return options;
|
---|
285 | }
|
---|
286 |
|
---|
287 | /**
|
---|
288 | * Returns an enumeration describing the available options.
|
---|
289 | *
|
---|
290 | * @return an enumeration of all the available options
|
---|
291 | */
|
---|
292 | public Enumeration listOptions() {
|
---|
293 |
|
---|
294 | Vector newVector = new Vector(6);
|
---|
295 |
|
---|
296 | newVector.addElement(new Option(
|
---|
297 | "\tSpecifies name of directory.",
|
---|
298 | "l", 1, "-l <directory name>"));
|
---|
299 | newVector.addElement(new Option(
|
---|
300 | "\tSpecifies name of model.",
|
---|
301 | "m", 1, "-m <model name>"));
|
---|
302 | newVector.addElement(new Option(
|
---|
303 | "\tSpecifies encoding.",
|
---|
304 | "e", 1, "-e <encoding>"));
|
---|
305 | newVector.addElement(new Option(
|
---|
306 | "\tSpecifies number of phrases to be output (default: 5).",
|
---|
307 | "n", 1, "-n"));
|
---|
308 | newVector.addElement(new Option(
|
---|
309 | "\tTurns debugging mode on.",
|
---|
310 | "d", 0, "-d"));
|
---|
311 | newVector.addElement(new Option(
|
---|
312 | "\tAlso write stemmed phrase and score into \".key\" file.",
|
---|
313 | "a", 0, "-a"));
|
---|
314 |
|
---|
315 | return newVector.elements();
|
---|
316 | }
|
---|
317 |
|
---|
318 | /**
|
---|
319 | * Collects the stems of the file names.
|
---|
320 | */
|
---|
321 | public Hashtable collectStems() throws Exception {
|
---|
322 |
|
---|
323 | Hashtable stems = new Hashtable();
|
---|
324 |
|
---|
325 | try {
|
---|
326 | File dir = new File(m_dirName);
|
---|
327 | String[] files = dir.list();
|
---|
328 | for (int i = 0; i < files.length; i++) {
|
---|
329 | if (files[i].endsWith(".txt")) {
|
---|
330 | String stem = files[i].substring(0, files[i].length() - 4);
|
---|
331 | if (!stems.containsKey(stem)) {
|
---|
332 | stems.put(stem, new Double(0));
|
---|
333 | }
|
---|
334 | }
|
---|
335 | }
|
---|
336 | } catch (Exception e) {
|
---|
337 | throw new Exception("Problem opening directory " + m_dirName);
|
---|
338 | }
|
---|
339 | return stems;
|
---|
340 | }
|
---|
341 |
|
---|
342 | /**
|
---|
343 | * Builds the model from the files
|
---|
344 | */
|
---|
345 | public void extractKeyphrases(Hashtable stems) throws Exception {
|
---|
346 |
|
---|
347 | Vector stats = new Vector();
|
---|
348 |
|
---|
349 | // Check whether there is actually any data
|
---|
350 | if (stems.size() == 0) {
|
---|
351 | throw new Exception("Couldn't find any data!");
|
---|
352 | }
|
---|
353 |
|
---|
354 | FastVector atts = new FastVector(2);
|
---|
355 | atts.addElement(new Attribute("doc", null));
|
---|
356 | atts.addElement(new Attribute("keyphrases", null));
|
---|
357 | Instances data = new Instances("keyphrase_training_data", atts, 0);
|
---|
358 |
|
---|
359 | // Extract keyphrases
|
---|
360 | Enumeration elem = stems.keys();
|
---|
361 | while (elem.hasMoreElements()) {
|
---|
362 | String str = (String)elem.nextElement();
|
---|
363 | double[] newInst = new double[2];
|
---|
364 | try {
|
---|
365 | File txt = new File(m_dirName + "/" + str + ".txt");
|
---|
366 | InputStreamReader is;
|
---|
367 | if (!m_encoding.equals("default")) {
|
---|
368 | is = new InputStreamReader(new FileInputStream(txt), m_encoding);
|
---|
369 | } else {
|
---|
370 | is = new InputStreamReader(new FileInputStream(txt));
|
---|
371 | }
|
---|
372 | StringBuffer txtStr = new StringBuffer();
|
---|
373 | int c;
|
---|
374 | while ((c = is.read()) != -1) {
|
---|
375 | txtStr.append((char)c);
|
---|
376 | }
|
---|
377 | newInst[0] = (double)data.attribute(0).addStringValue(txtStr.toString());
|
---|
378 | } catch (Exception e) {
|
---|
379 | if (m_debug) {
|
---|
380 | System.err.println("Can't read document " + str + ".txt");
|
---|
381 | }
|
---|
382 | newInst[0] = Instance.missingValue();
|
---|
383 | }
|
---|
384 | try {
|
---|
385 | File key = new File(m_dirName + "/" + str + ".key");
|
---|
386 | InputStreamReader is;
|
---|
387 | if (!m_encoding.equals("default")) {
|
---|
388 | is = new InputStreamReader(new FileInputStream(key), m_encoding);
|
---|
389 | } else {
|
---|
390 | is = new InputStreamReader(new FileInputStream(key));
|
---|
391 | }
|
---|
392 | StringBuffer keyStr = new StringBuffer();
|
---|
393 | int c;
|
---|
394 | while ((c = is.read()) != -1) {
|
---|
395 | keyStr.append((char)c);
|
---|
396 | }
|
---|
397 | newInst[1] = (double)data.attribute(1).addStringValue(keyStr.toString());
|
---|
398 | } catch (Exception e) {
|
---|
399 | if (m_debug) {
|
---|
400 | System.err.println("No keyphrases for stem " + str + ".");
|
---|
401 | }
|
---|
402 | newInst[1] = Instance.missingValue();
|
---|
403 | }
|
---|
404 | data.add(new Instance(1.0, newInst));
|
---|
405 | m_KEAFilter.input(data.instance(0));
|
---|
406 | data = data.stringFreeStructure();
|
---|
407 | if (m_debug) {
|
---|
408 | System.err.println("-- Document: " + str);
|
---|
409 | }
|
---|
410 | Instance[] topRankedInstances = new Instance[m_numPhrases];
|
---|
411 | Instance inst;
|
---|
412 | while ((inst = m_KEAFilter.output()) != null) {
|
---|
413 | int index = (int)inst.value(m_KEAFilter.getRankIndex()) - 1;
|
---|
414 | if (index < m_numPhrases) {
|
---|
415 | topRankedInstances[index] = inst;
|
---|
416 | }
|
---|
417 | }
|
---|
418 | if (m_debug) {
|
---|
419 | System.err.println("-- Keyphrases and feature values:");
|
---|
420 | }
|
---|
421 | FileOutputStream out = null;
|
---|
422 | PrintWriter printer = null;
|
---|
423 | File key = new File(m_dirName + "/" + str + ".key");
|
---|
424 | if (!key.exists()) {
|
---|
425 | out = new FileOutputStream(m_dirName + "/" + str + ".key");
|
---|
426 | if (!m_encoding.equals("default")) {
|
---|
427 | printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
|
---|
428 | } else {
|
---|
429 | printer = new PrintWriter(out);
|
---|
430 | }
|
---|
431 | }
|
---|
432 | double numExtracted = 0, numCorrect = 0;
|
---|
433 | for (int i = 0; i < m_numPhrases; i++) {
|
---|
434 | if (topRankedInstances[i] != null) {
|
---|
435 | if (!topRankedInstances[i].
|
---|
436 | isMissing(topRankedInstances[i].numAttributes() - 1)) {
|
---|
437 | numExtracted += 1.0;
|
---|
438 | }
|
---|
439 | if ((int)topRankedInstances[i].
|
---|
440 | value(topRankedInstances[i].numAttributes() - 1) ==
|
---|
441 | topRankedInstances[i].
|
---|
442 | attribute(topRankedInstances[i].numAttributes() - 1).
|
---|
443 | indexOfValue("True")) {
|
---|
444 | numCorrect += 1.0;
|
---|
445 | }
|
---|
446 | if (printer != null) {
|
---|
447 | printer.print(topRankedInstances[i].
|
---|
448 | stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
|
---|
449 | if (m_AdditionalInfo) {
|
---|
450 | printer.print("\t");
|
---|
451 | printer.print(topRankedInstances[i].
|
---|
452 | stringValue(m_KEAFilter.getStemmedPhraseIndex()));
|
---|
453 | printer.print("\t");
|
---|
454 | printer.print(Utils.
|
---|
455 | doubleToString(topRankedInstances[i].
|
---|
456 | value(m_KEAFilter.
|
---|
457 | getProbabilityIndex()), 4));
|
---|
458 | }
|
---|
459 | printer.println();
|
---|
460 | }
|
---|
461 | if (m_debug) {
|
---|
462 | System.err.println(topRankedInstances[i]);
|
---|
463 | }
|
---|
464 | }
|
---|
465 | }
|
---|
466 | if (numExtracted > 0) {
|
---|
467 | if (m_debug) {
|
---|
468 | System.err.println("-- " + numCorrect + " correct");
|
---|
469 | }
|
---|
470 | stats.addElement(new Double(numCorrect));
|
---|
471 | }
|
---|
472 | if (printer != null) {
|
---|
473 | printer.flush();
|
---|
474 | printer.close();
|
---|
475 | out.close();
|
---|
476 | }
|
---|
477 | }
|
---|
478 | double[] st = new double[stats.size()];
|
---|
479 | for (int i = 0; i < stats.size(); i++) {
|
---|
480 | st[i] = ((Double)stats.elementAt(i)).doubleValue();
|
---|
481 | }
|
---|
482 | double avg = Utils.mean(st);
|
---|
483 | double stdDev = Math.sqrt(Utils.variance(st));
|
---|
484 | System.err.println("Avg. number of correct keyphrases: " +
|
---|
485 | Utils.doubleToString(avg, 2) + " +/- " +
|
---|
486 | Utils.doubleToString(stdDev, 2));
|
---|
487 | System.err.println("Based on " + stats.size() + " documents");
|
---|
488 | m_KEAFilter.batchFinished();
|
---|
489 | }
|
---|
490 |
|
---|
491 | /**
|
---|
492 | * Loads the extraction model from the file.
|
---|
493 | */
|
---|
494 | public void loadModel() throws Exception {
|
---|
495 |
|
---|
496 | BufferedInputStream inStream =
|
---|
497 | new BufferedInputStream(new FileInputStream(m_modelName));
|
---|
498 | ObjectInputStream in = new ObjectInputStream(inStream);
|
---|
499 | m_KEAFilter = (KEAFilter)in.readObject();
|
---|
500 | in.close();
|
---|
501 | }
|
---|
502 |
|
---|
503 | /**
|
---|
504 | * The main method.
|
---|
505 | */
|
---|
506 | public static void main(String[] ops) {
|
---|
507 |
|
---|
508 | KEAKeyphraseExtractor kmb = new KEAKeyphraseExtractor();
|
---|
509 | try {
|
---|
510 | kmb.setOptions(ops);
|
---|
511 | System.err.print("Extracting keyphrases with options: ");
|
---|
512 | String[] optionSettings = kmb.getOptions();
|
---|
513 | for (int i = 0; i < optionSettings.length; i++) {
|
---|
514 | System.err.print(optionSettings[i] + " ");
|
---|
515 | }
|
---|
516 | System.err.println();
|
---|
517 | kmb.loadModel();
|
---|
518 | kmb.extractKeyphrases(kmb.collectStems());
|
---|
519 | } catch (Exception e) {
|
---|
520 | e.printStackTrace();
|
---|
521 | System.err.println(e.getMessage());
|
---|
522 | System.err.println("\nOptions:\n");
|
---|
523 | Enumeration enum = kmb.listOptions();
|
---|
524 | while (enum.hasMoreElements()) {
|
---|
525 | Option option = (Option) enum.nextElement();
|
---|
526 | System.err.println(option.synopsis());
|
---|
527 | System.err.println(option.description());
|
---|
528 | }
|
---|
529 | }
|
---|
530 | }
|
---|
531 | }
|
---|
532 |
|
---|