source: trunk/gsdl/packages/kea/kea-3.0/KEAModelBuilder.java@ 8815

Last change on this file since 8815 was 8815, checked in by mdewsnip, 19 years ago

Kea 3.0, as downloaded from http://www.nzdl.org/kea but with CSTR_abstracts_test, CSTR_abstracts_train, Chinese_test, and Chinese_train directories removed.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.9 KB
Line 
1/*
2 * KEAModelBuilder.java
3 * Copyright (C) 2001 Eibe Frank
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20import java.io.*;
21import java.util.*;
22import weka.core.*;
23import weka.filters.*;
24
25/**
26 * Builds a keyphrase extraction model from the documents in a given
27 * directory. Assumes that the file names for the documents end with
28 * ".txt". Assumes that files containing corresponding
29 * author-assigned keyphrases end with ".key". Optionally an encoding
30 * for the documents/keyphrases can be defined (e.g. for Chinese
31 * text).
32 *
33 * Valid options are:<p>
34 *
35 * -l "directory name"<br>
36 * Specifies name of directory.<p>
37 *
38 * -m "model name"<br>
39 * Specifies name of model.<p>
40 *
41 * -e "encoding"<br>
42 * Specifies encoding.<p>
43 *
44 * -d<br>
45 * Turns debugging mode on.<p>
46 *
47 * -k<br>
48 * Use keyphrase frequency statistic.<p>
49 *
50 * -p<br>
51 * Disallow internal periods.<p>
52 *
53 * -x "length"<br>
54 * Sets maximum phrase length (default: 3).<p>
55 *
56 * -y "length"<br>
57 * Sets minimum phrase length (default: 1).<p>
58 *
59 * -o "number"<br>
60 * The minimum number of times a phrase needs to occur (default: 2). <p>
61 *
62 * -s "name of class implementing list of stop words"<br>
63 * Sets list of stop words to used (default: StopwordsEnglish).<p>
64 *
65 * -t "name of class implementing stemmer"<br>
66 * Sets stemmer to use (default: IteratedLovinsStemmer). <p>
67 *
68 * -n<br>
69 * Do not check for proper nouns. <p>
70 *
71 * @author Eibe Frank ([email protected])
72 * @version 1.0
73 */
74public class KEAModelBuilder implements OptionHandler {
75
76 /** Name of directory */
77 String m_dirName = null;
78
79 /** Name of model */
80 String m_modelName = null;
81
82 /** Encoding */
83 String m_encoding = "default";
84
85 /** Debugging mode? */
86 boolean m_debug = false;
87
88 /** Use keyphrase frequency attribute? */
89 boolean m_useKFrequency = false;
90
91 /** Disallow internal periods? */
92 boolean m_disallowIPeriods = false;
93
94 /** The maximum length of phrases */
95 private int m_MaxPhraseLength = 3;
96
97 /** The minimum length of phrases */
98 private int m_MinPhraseLength = 1;
99
100 /** The minimum number of occurences of a phrase */
101 private int m_MinNumOccur = 2;
102
103 /** The KEA filter object */
104 KEAFilter m_KEAFilter = null;
105
106 /** The stemmer to be used */
107 private Stemmer m_Stemmer = new IteratedLovinsStemmer();
108
109 /** The list of stop words to be used */
110 private Stopwords m_Stopwords = new StopwordsEnglish();
111
112 /** Determines whether check for proper nouns is performed */
113 private boolean m_CheckForProperNouns = true;
114
115 /**
116 * Get the M_CheckProperNouns value.
117 * @return the M_CheckProperNouns value.
118 */
119 public boolean getCheckForProperNouns() {
120 return m_CheckForProperNouns;
121 }
122
123 /**
124 * Set the M_CheckProperNouns value.
125 * @param newM_CheckProperNouns The new M_CheckProperNouns value.
126 */
127 public void setCheckForProperNouns(boolean newM_CheckProperNouns) {
128 this.m_CheckForProperNouns = newM_CheckProperNouns;
129 }
130
131 /**
132 * Get the M_Stopwords value.
133 * @return the M_Stopwords value.
134 */
135 public Stopwords getStopwords() {
136
137 return m_Stopwords;
138 }
139
140 /**
141 * Set the M_Stopwords value.
142 * @param newM_Stopwords The new M_Stopwords value.
143 */
144 public void setStopwords(Stopwords newM_Stopwords) {
145
146 this.m_Stopwords = newM_Stopwords;
147 }
148
149
150 /**
151 * Get the Stemmer value.
152 * @return the Stemmer value.
153 */
154 public Stemmer getStemmer() {
155
156 return m_Stemmer;
157 }
158
159 /**
160 * Set the Stemmer value.
161 * @param newStemmer The new Stemmer value.
162 */
163 public void setStemmer(Stemmer newStemmer) {
164
165 this.m_Stemmer = newStemmer;
166 }
167
168 /**
169 * Get the value of MinNumOccur.
170 *
171 * @return Value of MinNumOccur.
172 */
173 public int getMinNumOccur() {
174
175 return m_MinNumOccur;
176 }
177
178 /**
179 * Set the value of MinNumOccur.
180 *
181 * @param newMinNumOccur Value to assign to MinNumOccur.
182 */
183 public void setMinNumOccur(int newMinNumOccur) {
184
185 m_MinNumOccur = newMinNumOccur;
186 }
187
188 /**
189 * Get the value of MaxPhraseLength.
190 *
191 * @return Value of MaxPhraseLength.
192 */
193 public int getMaxPhraseLength() {
194
195 return m_MaxPhraseLength;
196 }
197
198 /**
199 * Set the value of MaxPhraseLength.
200 *
201 * @param newMaxPhraseLength Value to assign to MaxPhraseLength.
202 */
203 public void setMaxPhraseLength(int newMaxPhraseLength) {
204
205 m_MaxPhraseLength = newMaxPhraseLength;
206 }
207
208 /**
209 * Get the value of MinPhraseLength.
210 *
211 * @return Value of MinPhraseLength.
212 */
213 public int getMinPhraseLength() {
214
215 return m_MinPhraseLength;
216 }
217
218 /**
219 * Set the value of MinPhraseLength.
220 *
221 * @param newMinPhraseLength Value to assign to MinPhraseLength.
222 */
223 public void setMinPhraseLength(int newMinPhraseLength) {
224
225 m_MinPhraseLength = newMinPhraseLength;
226 }
227
228 /**
229 * Get the value of disallowIPeriods.
230 *
231 * @return Value of disallowIPeriods.
232 */
233 public boolean getDisallowIPeriods() {
234
235 return m_disallowIPeriods;
236 }
237
238 /**
239 * Set the value of disallowIPeriods.
240 *
241 * @param newdisallowIPeriods Value to assign to disallowIPeriods.
242 */
243 public void setDisallowIPeriods(boolean newdisallowIPeriods) {
244
245 m_disallowIPeriods = newdisallowIPeriods;
246 }
247
248 /**
249 * Get the value of useKFrequency.
250 *
251 * @return Value of useKFrequency.
252 */
253 public boolean getUseKFrequency() {
254
255 return m_useKFrequency;
256 }
257
258 /**
259 * Set the value of useKFrequency.
260 *
261 * @param newuseKFrequency Value to assign to useKFrequency.
262 */
263 public void setUseKFrequency(boolean newuseKFrequency) {
264
265 m_useKFrequency = newuseKFrequency;
266 }
267
268 /**
269 * Get the value of debug.
270 *
271 * @return Value of debug.
272 */
273 public boolean getDebug() {
274
275 return m_debug;
276 }
277
278 /**
279 * Set the value of debug.
280 *
281 * @param newdebug Value to assign to debug.
282 */
283 public void setDebug(boolean newdebug) {
284
285 m_debug = newdebug;
286 }
287
288 /**
289 * Get the value of encoding.
290 *
291 * @return Value of encoding.
292 */
293 public String getEncoding() {
294
295 return m_encoding;
296 }
297
298 /**
299 * Set the value of encoding.
300 *
301 * @param newencoding Value to assign to encoding.
302 */
303 public void setEncoding(String newencoding) {
304
305 m_encoding = newencoding;
306 }
307
308 /**
309 * Get the value of modelName.
310 *
311 * @return Value of modelName.
312 */
313 public String getModelName() {
314
315 return m_modelName;
316 }
317
318 /**
319 * Set the value of modelName.
320 *
321 * @param newmodelName Value to assign to modelName.
322 */
323 public void setModelName(String newmodelName) {
324
325 m_modelName = newmodelName;
326 }
327
328 /**
329 * Get the value of dirName.
330 *
331 * @return Value of dirName.
332 */
333 public String getDirName() {
334
335 return m_dirName;
336 }
337
338 /**
339 * Set the value of dirName.
340 *
341 * @param newdirName Value to assign to dirName.
342 */
343 public void setDirName(String newdirName) {
344
345 m_dirName = newdirName;
346 }
347
348 /**
349 * Parses a given list of options controlling the behaviour of this object.
350 * Valid options are:<p>
351 *
352 * -l "directory name" <br>
353 * Specifies name of directory.<p>
354 *
355 * -m "model name" <br>
356 * Specifies name of model.<p>
357 *
358 * -e "encoding" <br>
359 * Specifies encoding.<p>
360 *
361 * -d<br>
362 * Turns debugging mode on.<p>
363 *
364 * -k<br>
365 * Use keyphrase frequency statistic.<p>
366 *
367 * -p<br>
368 * Disallow internal periods. <p>
369 *
370 * -x "length"<br>
371 * Sets maximum phrase length (default: 3).<p>
372 *
373 * -y "length"<br>
374 * Sets minimum phrase length (default: 3).<p>
375 *
376 * -o "number"<br>
377 * The minimum number of times a phrase needs to occur (default: 2). <p>
378 *
379 * -s "name of class implementing list of stop words"<br>
380 * Sets list of stop words to used (default: StopwordsEnglish).<p>
381 *
382 * -t "name of class implementing stemmer"<br>
383 * Sets stemmer to use (default: IteratedLovinsStemmer). <p>
384 *
385 * -n<br>
386 * Do not check for proper nouns. <p>
387 *
388 * @param options the list of options as an array of strings
389 * @exception Exception if an option is not supported
390 */
391 public void setOptions(String[] options) throws Exception {
392
393 String dirName = Utils.getOption('l', options);
394 if (dirName.length() > 0) {
395 setDirName(dirName);
396 } else {
397 setDirName(null);
398 throw new Exception("Name of directory required argument.");
399 }
400 String modelName = Utils.getOption('m', options);
401 if (modelName.length() > 0) {
402 setModelName(modelName);
403 } else {
404 setModelName(null);
405 throw new Exception("Name of model required argument.");
406 }
407 String encoding = Utils.getOption('e', options);
408 if (encoding.length() > 0) {
409 setEncoding(encoding);
410 } else {
411 setEncoding("default");
412 }
413 String maxPhraseLengthString = Utils.getOption('x', options);
414 if (maxPhraseLengthString.length() > 0) {
415 setMaxPhraseLength(Integer.parseInt(maxPhraseLengthString));
416 } else {
417 setMaxPhraseLength(3);
418 }
419 String minPhraseLengthString = Utils.getOption('y', options);
420 if (minPhraseLengthString.length() > 0) {
421 setMinPhraseLength(Integer.parseInt(minPhraseLengthString));
422 } else {
423 setMinPhraseLength(1);
424 }
425 String minNumOccurString = Utils.getOption('o', options);
426 if (minNumOccurString.length() > 0) {
427 setMinNumOccur(Integer.parseInt(minNumOccurString));
428 } else {
429 setMinNumOccur(2);
430 }
431 String stopwordsString = Utils.getOption('s', options);
432 if (stopwordsString.length() > 0) {
433 setStopwords((Stopwords)Class.forName(stopwordsString).newInstance());
434 }
435 String stemmerString = Utils.getOption('t', options);
436 if (stemmerString.length() > 0) {
437 setStemmer((Stemmer)Class.forName(stemmerString).newInstance());
438 }
439 setDebug(Utils.getFlag('d', options));
440 setUseKFrequency(Utils.getFlag('k', options));
441 setDisallowIPeriods(Utils.getFlag('p', options));
442 setCheckForProperNouns(!Utils.getFlag('n', options));
443 Utils.checkForRemainingOptions(options);
444 }
445
446 /**
447 * Gets the current option settings.
448 *
449 * @return an array of strings suitable for passing to setOptions
450 */
451 public String [] getOptions() {
452
453 String [] options = new String [20];
454 int current = 0;
455
456 options[current++] = "-l";
457 options[current++] = "" + (getDirName());
458 options[current++] = "-m";
459 options[current++] = "" + (getModelName());
460 options[current++] = "-e";
461 options[current++] = "" + (getEncoding());
462 if (getUseKFrequency()) {
463 options[current++] = "-k";
464 }
465 if (getDebug()) {
466 options[current++] = "-d";
467 }
468 if (getDisallowIPeriods()) {
469 options[current++] = "-p";
470 }
471 options[current++] = "-x";
472 options[current++] = "" + (getMaxPhraseLength());
473 options[current++] = "-y";
474 options[current++] = "" + (getMinPhraseLength());
475 options[current++] = "-o";
476 options[current++] = "" + (getMinNumOccur());
477 options[current++] = "-s";
478 options[current++] = "" + (getStopwords().getClass().getName());
479 options[current++] = "-t";
480 options[current++] = "" + (getStemmer().getClass().getName());
481 if (getCheckForProperNouns()) {
482 options[current++] = "-n";
483 }
484
485 while (current < options.length) {
486 options[current++] = "";
487 }
488 return options;
489 }
490
491 /**
492 * Returns an enumeration describing the available options.
493 *
494 * @return an enumeration of all the available options
495 */
496 public Enumeration listOptions() {
497
498 Vector newVector = new Vector(12);
499
500 newVector.addElement(new Option(
501 "\tSpecifies name of directory.",
502 "l", 1, "-l <directory name>"));
503 newVector.addElement(new Option(
504 "\tSpecifies name of model.",
505 "m", 1, "-m <model name>"));
506 newVector.addElement(new Option(
507 "\tSpecifies encoding.",
508 "e", 1, "-e <encoding>"));
509 newVector.addElement(new Option(
510 "\tTurns debugging mode on.",
511 "d", 0, "-d"));
512 newVector.addElement(new Option(
513 "\tUse keyphrase frequency statistic.",
514 "k", 0, "-k"));
515 newVector.addElement(new Option(
516 "\tDisallow internal periods.",
517 "p", 0, "-p"));
518 newVector.addElement(new Option(
519 "\tSets the maximum phrase length (default: 3).",
520 "x", 1, "-x <length>"));
521 newVector.addElement(new Option(
522 "\tSets the minimum phrase length (default: 1).",
523 "y", 1, "-y <length>"));
524 newVector.addElement(new Option(
525 "\tSet the minimum number of occurences (default: 2).",
526 "o", 1, "-o"));
527 newVector.addElement(new Option(
528 "\tSets the list of stopwords to use (default: StopwordsEnglish).",
529 "s", 1, "-s <name of stopwords class>"));
530 newVector.addElement(new Option(
531 "\tSet the stemmer to use (default: IteratedLovinsStemmer).",
532 "t", 1, "-t <name of stemmer class>"));
533 newVector.addElement(new Option(
534 "\tDo not check for proper nouns.",
535 "n", 0, "-n"));
536
537 return newVector.elements();
538 }
539
540 /**
541 * Collects the stems of the file names.
542 */
543 public Hashtable collectStems() throws Exception {
544
545 Hashtable stems = new Hashtable();
546
547 try {
548 File dir = new File(m_dirName);
549 String[] files = dir.list();
550 for (int i = 0; i < files.length; i++) {
551 if (files[i].endsWith(".key") ||
552 files[i].endsWith(".txt")) {
553 String stem = files[i].substring(0, files[i].length() - 4);
554 if (!stems.containsKey(stem)) {
555 stems.put(stem, new Double(0));
556 }
557 }
558 }
559 } catch (Exception e) {
560 throw new Exception("Problem opening directory " + m_dirName);
561 }
562 return stems;
563 }
564
565 /**
566 * Builds the model from the files
567 */
568 public void buildModel(Hashtable stems) throws Exception {
569
570 // Check whether there is actually any data
571 if (stems.size() == 0) {
572 throw new Exception("Couldn't find any data!");
573 }
574
575 FastVector atts = new FastVector(2);
576 atts.addElement(new Attribute("doc", null));
577 atts.addElement(new Attribute("keyphrases", null));
578 Instances data = new Instances("keyphrase_training_data", atts, 0);
579
580 // Build model
581 m_KEAFilter = new KEAFilter();
582 m_KEAFilter.setDebug(m_debug);
583 m_KEAFilter.setDisallowInternalPeriods(getDisallowIPeriods());
584 m_KEAFilter.setKFused(getUseKFrequency());
585 m_KEAFilter.setMaxPhraseLength(getMaxPhraseLength());
586 m_KEAFilter.setMinPhraseLength(getMinPhraseLength());
587 m_KEAFilter.setMinNumOccur(getMinNumOccur());
588 m_KEAFilter.setInputFormat(data);
589 m_KEAFilter.setStemmer(getStemmer());
590 m_KEAFilter.setStopwords(getStopwords());
591 m_KEAFilter.setCheckForProperNouns(getCheckForProperNouns());
592 Enumeration elem = stems.keys();
593 while (elem.hasMoreElements()) {
594 String str = (String)elem.nextElement();
595 double[] newInst = new double[2];
596 try {
597 File txt = new File(m_dirName + "/" + str + ".txt");
598 InputStreamReader is;
599 if (!m_encoding.equals("default")) {
600 is = new InputStreamReader(new FileInputStream(txt), m_encoding);
601 } else {
602 is = new InputStreamReader(new FileInputStream(txt));
603 }
604 StringBuffer txtStr = new StringBuffer();
605 int c;
606 while ((c = is.read()) != -1) {
607 txtStr.append((char)c);
608 }
609 newInst[0] = (double)data.attribute(0).addStringValue(txtStr.toString());
610 } catch (Exception e) {
611 if (m_debug) {
612 System.err.println("Can't find document for stem " + str + ".");
613 }
614 newInst[0] = Instance.missingValue();
615 }
616 try {
617 File key = new File(m_dirName + "/" + str + ".key");
618 InputStreamReader is;
619 if (!m_encoding.equals("default")) {
620 is = new InputStreamReader(new FileInputStream(key), m_encoding);
621 } else {
622 is = new InputStreamReader(new FileInputStream(key));
623 }
624 StringBuffer keyStr = new StringBuffer();
625 int c;
626 while ((c = is.read()) != -1) {
627 keyStr.append((char)c);
628 }
629 newInst[1] = (double)data.attribute(1).addStringValue(keyStr.toString());
630 } catch (Exception e) {
631 if (m_debug) {
632 System.err.println("Can't find keyphrases for stem " + str + ".");
633 }
634 newInst[1] = Instance.missingValue();
635 }
636 data.add(new Instance(1.0, newInst));
637 m_KEAFilter.input(data.instance(0));
638 data = data.stringFreeStructure();
639 }
640 m_KEAFilter.batchFinished();
641
642 // Get rid of instances in filter
643 Instance dummy;
644 while ((dummy = m_KEAFilter.output()) != null) {};
645 }
646
647 /**
648 * Saves the extraction model to the file.
649 */
650 public void saveModel() throws Exception {
651
652 BufferedOutputStream bufferedOut =
653 new BufferedOutputStream(new FileOutputStream(m_modelName));
654 ObjectOutputStream out = new ObjectOutputStream(bufferedOut);
655 out.writeObject(m_KEAFilter);
656 out.flush();
657 out.close();
658 }
659
660 /**
661 * The main method.
662 */
663 public static void main(String[] ops) {
664
665 KEAModelBuilder kmb = new KEAModelBuilder();
666 try {
667 kmb.setOptions(ops);
668 System.err.print("Building model with options: ");
669 String[] optionSettings = kmb.getOptions();
670 for (int i = 0; i < optionSettings.length; i++) {
671 System.err.print(optionSettings[i] + " ");
672 }
673 System.err.println();
674 kmb.buildModel(kmb.collectStems());
675 kmb.saveModel();
676 } catch (Exception e) {
677 e.printStackTrace();
678 System.err.println(e.getMessage());
679 System.err.println("\nOptions:\n");
680 Enumeration enum = kmb.listOptions();
681 while (enum.hasMoreElements()) {
682 Option option = (Option) enum.nextElement();
683 System.err.println(option.synopsis());
684 System.err.println(option.description());
685 }
686 }
687 }
688}
689
Note: See TracBrowser for help on using the repository browser.