source: trunk/gsdl/packages/kea/kea-3.0/KEAFilter.java@8815

Last change on this file since 8815 was 8815, checked in by mdewsnip, 19 years ago

Kea 3.0, as downloaded from http://www.nzdl.org/kea but with CSTR_abstracts_test, CSTR_abstracts_train, Chinese_test, and Chinese_train directories removed.

  • Property svn:keywords set to Author Date Id Revision
File size: 41.3 KB
/*
 * KEAFilter.java
 * Copyright (C) 2000, 2001 Eibe Frank
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

import java.util.*;
import java.io.*;

import weka.core.*;
import weka.filters.*;
import weka.classifiers.*;

/**
 * This filter converts the incoming data into data appropriate for
 * keyphrase classification. It assumes that the dataset contains two
 * string attributes. The first attribute should contain the text of a
 * document. The second attribute should contain the keyphrases
 * associated with that document (if present).
 *
 * The filter converts every instance (i.e. document) into a set of
 * instances, one for each word-based n-gram in the document. The
 * string attribute representing the document is replaced by some
 * numeric features, the estimated probability of each n-gram being a
 * keyphrase, and the rank of this phrase in the document according to
 * the probability. Each new instance also has a class value
 * associated with it. The class is "true" if the n-gram is a true
 * keyphrase, and "false" otherwise. Of course, if the input document
 * doesn't come with author-assigned keyphrases, the class values for
 * that document will be missing.
 *
 * @author Eibe Frank ([email protected])
 * @version 1.0
 */
public class KEAFilter extends Filter implements OptionHandler {

  /** Index of attribute containing the documents */
  private int m_DocumentAtt = 0;

  /** Index of attribute containing the keyphrases */
  private int m_KeyphrasesAtt = 1;

  /** The maximum length of phrases */
  private int m_MaxPhraseLength = 3;

  /** The minimum length of phrases */
  private int m_MinPhraseLength = 1;

  /** Is keyphrase frequency attribute being used? */
  private boolean m_KFused = false;

  /** Flag for debugging mode */
  private boolean m_Debug = false;

  /** Determines whether internal periods are allowed */
  private boolean m_DisallowInternalPeriods = false;

  /** The minimum number of occurrences of a phrase */
  private int m_MinNumOccur = 2;

  /** The number of features describing a phrase */
  private int m_NumFeatures = 2;

  /* Indices of attributes in m_ClassifierData */
  private int m_TfidfIndex = 0;
  private int m_FirstOccurIndex = 1;
  private int m_KeyFreqIndex = 2;

  /** The punctuation filter used by this filter */
  private KEAPhraseFilter m_PunctFilter = null;

  /** The numbers filter used by this filter */
  private NumbersFilter m_NumbersFilter = null;

  /** The actual classifier used to compute probabilities */
  private DistributionClassifier m_Classifier = null;

  /** The dictionary containing the document frequencies */
  private HashMap m_Dictionary = null;

  /** The dictionary containing the keyphrases */
  private HashMap m_KeyphraseDictionary = null;

  /** The number of documents in the global frequencies corpus */
  private int m_NumDocs = 0;

  /** Template for the classifier data */
  private Instances m_ClassifierData = null;

  /** The stemmer to be used */
  private Stemmer m_Stemmer = new IteratedLovinsStemmer();

  /** The list of stop words to be used */
  private Stopwords m_Stopwords = new StopwordsEnglish();

  /** Determines whether check for proper nouns is performed */
  private boolean m_CheckForProperNouns = true;

  /**
   * Get the CheckForProperNouns value.
   * @return the CheckForProperNouns value.
   */
  public boolean getCheckForProperNouns() {
    return m_CheckForProperNouns;
  }

  /**
   * Set the CheckForProperNouns value.
   * @param newCheckForProperNouns the new CheckForProperNouns value.
   */
  public void setCheckForProperNouns(boolean newCheckForProperNouns) {
    this.m_CheckForProperNouns = newCheckForProperNouns;
  }

  /**
   * Get the Stopwords value.
   * @return the Stopwords value.
   */
  public Stopwords getStopwords() {
    return m_Stopwords;
  }

  /**
   * Set the Stopwords value.
   * @param newStopwords the new Stopwords value.
   */
  public void setStopwords(Stopwords newStopwords) {
    this.m_Stopwords = newStopwords;
  }

  /**
   * Get the Stemmer value.
   * @return the Stemmer value.
   */
  public Stemmer getStemmer() {

    return m_Stemmer;
  }

  /**
   * Set the Stemmer value.
   * @param newStemmer The new Stemmer value.
   */
  public void setStemmer(Stemmer newStemmer) {

    this.m_Stemmer = newStemmer;
  }

  /**
   * Get the value of MinNumOccur.
   *
   * @return Value of MinNumOccur.
   */
  public int getMinNumOccur() {

    return m_MinNumOccur;
  }

  /**
   * Set the value of MinNumOccur.
   *
   * @param newMinNumOccur Value to assign to MinNumOccur.
   */
  public void setMinNumOccur(int newMinNumOccur) {

    m_MinNumOccur = newMinNumOccur;
  }

  /**
   * Get the value of MaxPhraseLength.
   *
   * @return Value of MaxPhraseLength.
   */
  public int getMaxPhraseLength() {

    return m_MaxPhraseLength;
  }

  /**
   * Set the value of MaxPhraseLength.
   *
   * @param newMaxPhraseLength Value to assign to MaxPhraseLength.
   */
  public void setMaxPhraseLength(int newMaxPhraseLength) {

    m_MaxPhraseLength = newMaxPhraseLength;
  }

  /**
   * Get the value of MinPhraseLength.
   *
   * @return Value of MinPhraseLength.
   */
  public int getMinPhraseLength() {

    return m_MinPhraseLength;
  }

  /**
   * Set the value of MinPhraseLength.
   *
   * @param newMinPhraseLength Value to assign to MinPhraseLength.
   */
  public void setMinPhraseLength(int newMinPhraseLength) {

    m_MinPhraseLength = newMinPhraseLength;
  }

  /**
   * Returns the index of the stemmed phrases in the output ARFF file.
   */
  public int getStemmedPhraseIndex() {

    return m_DocumentAtt;
  }

  /**
   * Returns the index of the unstemmed phrases in the output ARFF file.
   */
  public int getUnstemmedPhraseIndex() {

    return m_DocumentAtt + 1;
  }

  /**
   * Returns the index of the phrases' probabilities in the output ARFF file.
   */
  public int getProbabilityIndex() {

    int index = m_DocumentAtt + 4;

    if (m_Debug) {
      if (m_KFused) {
        index++;
      }
    }
    return index;
  }

  /**
   * Returns the index of the phrases' ranks in the output ARFF file.
   */
  public int getRankIndex() {

    return getProbabilityIndex() + 1;
  }

  /**
   * Get the value of DocumentAtt.
   *
   * @return Value of DocumentAtt.
   */
  public int getDocumentAtt() {

    return m_DocumentAtt;
  }

  /**
   * Set the value of DocumentAtt.
   *
   * @param newDocumentAtt Value to assign to DocumentAtt.
   */
  public void setDocumentAtt(int newDocumentAtt) {

    m_DocumentAtt = newDocumentAtt;
  }

  /**
   * Get the value of KeyphrasesAtt.
   *
   * @return Value of KeyphrasesAtt.
   */
  public int getKeyphrasesAtt() {

    return m_KeyphrasesAtt;
  }

  /**
   * Set the value of KeyphrasesAtt.
   *
   * @param newKeyphrasesAtt Value to assign to KeyphrasesAtt.
   */
  public void setKeyphrasesAtt(int newKeyphrasesAtt) {

    m_KeyphrasesAtt = newKeyphrasesAtt;
  }

  /**
   * Get the value of Debug.
   *
   * @return Value of Debug.
   */
  public boolean getDebug() {

    return m_Debug;
  }

  /**
   * Set the value of Debug.
   *
   * @param newDebug Value to assign to Debug.
   */
  public void setDebug(boolean newDebug) {

    m_Debug = newDebug;
  }

  /**
   * Sets whether keyphrase frequency attribute is used.
   */
  public void setKFused(boolean flag) {

    m_KFused = flag;
    if (flag) {
      m_NumFeatures = 3;
    } else {
      m_NumFeatures = 2;
    }
  }

  /**
   * Gets whether keyphrase frequency attribute is used.
   */
  public boolean getKFused() {

    return m_KFused;
  }

  /**
   * Gets whether phrases with internal periods are disallowed.
   *
   * @return true if phrases with internal periods are disallowed
   */
  public boolean getDisallowInternalPeriods() {

    return m_DisallowInternalPeriods;
  }

  /**
   * Sets whether phrases with internal periods are disallowed.
   *
   * @param disallow true if phrases with internal periods are to be disallowed
   */
  public void setDisallowInternalPeriods(boolean disallow) {

    m_DisallowInternalPeriods = disallow;
  }

  /**
   * Parses a given list of options controlling the behaviour of this object.
   * Valid options are:<p>
   *
   * -K<br>
   * Specifies whether keyphrase frequency statistic is used.<p>
   *
   * -M length<br>
   * Sets the maximum phrase length (default: 3).<p>
   *
   * -L length<br>
   * Sets the minimum phrase length (default: 1).<p>
   *
   * -D<br>
   * Turns debugging mode on.<p>
   *
   * -I index<br>
   * Sets the index of the attribute containing the documents (default: 0).<p>
   *
   * -J index<br>
   * Sets the index of the attribute containing the keyphrases (default: 1).<p>
   *
   * -P<br>
   * Disallow internal periods.<p>
   *
   * -O number<br>
   * The minimum number of times a phrase needs to occur (default: 2).<p>
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {

    setKFused(Utils.getFlag('K', options));
    setDebug(Utils.getFlag('D', options));
    String docAttIndexString = Utils.getOption('I', options);
    if (docAttIndexString.length() > 0) {
      setDocumentAtt(Integer.parseInt(docAttIndexString) - 1);
    } else {
      setDocumentAtt(0);
    }
    String keyphraseAttIndexString = Utils.getOption('J', options);
    if (keyphraseAttIndexString.length() > 0) {
      setKeyphrasesAtt(Integer.parseInt(keyphraseAttIndexString) - 1);
    } else {
      setKeyphrasesAtt(1);
    }
    String maxPhraseLengthString = Utils.getOption('M', options);
    if (maxPhraseLengthString.length() > 0) {
      setMaxPhraseLength(Integer.parseInt(maxPhraseLengthString));
    } else {
      setMaxPhraseLength(3);
    }
    String minPhraseLengthString = Utils.getOption('L', options);
    if (minPhraseLengthString.length() > 0) {
      setMinPhraseLength(Integer.parseInt(minPhraseLengthString));
    } else {
      setMinPhraseLength(1);
    }
    String minNumOccurString = Utils.getOption('O', options);
    if (minNumOccurString.length() > 0) {
      setMinNumOccur(Integer.parseInt(minNumOccurString));
    } else {
      setMinNumOccur(2);
    }
    setDisallowInternalPeriods(Utils.getFlag('P', options));
  }
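
  // Example (added; not part of the original source): the options above
  // could be supplied programmatically like this, assuming the Weka 3.x
  // OptionHandler API that this class implements:
  //
  //   KEAFilter filter = new KEAFilter();
  //   // document text in attribute 1, keyphrases in attribute 2 (1-based),
  //   // phrases of 1 to 3 words that occur at least twice
  //   filter.setOptions(new String[] {"-I", "1", "-J", "2", "-M", "3",
  //                                   "-L", "1", "-O", "2"});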

  /**
   * Gets the current settings of the filter.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String [] getOptions() {

    String [] options = new String [13];
    int current = 0;

    if (getKFused()) {
      options[current++] = "-K";
    }
    if (getDebug()) {
      options[current++] = "-D";
    }
    options[current++] = "-I";
    options[current++] = "" + (getDocumentAtt() + 1);
    options[current++] = "-J";
    options[current++] = "" + (getKeyphrasesAtt() + 1);
    options[current++] = "-M";
    options[current++] = "" + (getMaxPhraseLength());
    options[current++] = "-L";
    options[current++] = "" + (getMinPhraseLength());
    options[current++] = "-O";
    options[current++] = "" + (getMinNumOccur());

    if (getDisallowInternalPeriods()) {
      options[current++] = "-P";
    }

    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Returns an enumeration describing the available options
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {

    Vector newVector = new Vector(8);

    newVector.addElement(new Option(
        "\tSpecifies whether keyphrase frequency statistic is used.",
        "K", 0, "-K"));
    newVector.addElement(new Option(
        "\tSets the maximum phrase length (default: 3).",
        "M", 1, "-M <length>"));
    newVector.addElement(new Option(
        "\tSets the minimum phrase length (default: 1).",
        "L", 1, "-L <length>"));
    newVector.addElement(new Option(
        "\tTurns debugging mode on.",
        "D", 0, "-D"));
    newVector.addElement(new Option(
        "\tSets the index of the document attribute (default: 0).",
        "I", 1, "-I <index>"));
    newVector.addElement(new Option(
        "\tSets the index of the keyphrase attribute (default: 1).",
        "J", 1, "-J <index>"));
    newVector.addElement(new Option(
        "\tDisallow internal periods.",
        "P", 0, "-P"));
    newVector.addElement(new Option(
        "\tSet the minimum number of occurrences (default: 2).",
        "O", 1, "-O <number>"));

    return newVector.elements();
  }

  /**
   * Returns a string describing this filter
   *
   * @return a description of the filter suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "Converts incoming data into data appropriate for " +
      "keyphrase classification.";
  }

  /**
   * Sets the format of the input instances.
   *
   * @param instanceInfo an Instances object containing the input
   * instance structure (any instances contained in the object are
   * ignored - only the structure is required).
   * @return true if the outputFormat may be collected immediately
   */
  public boolean setInputFormat(Instances instanceInfo) throws Exception {

    if (instanceInfo.classIndex() >= 0) {
      throw new Exception("Don't know what to do if class index set!");
    }
    if (!instanceInfo.attribute(m_KeyphrasesAtt).isString() ||
        !instanceInfo.attribute(m_DocumentAtt).isString()) {
      throw new Exception("Keyphrase attribute and document attribute " +
                          "need to be string attributes.");
    }
    m_PunctFilter = new KEAPhraseFilter();
    int[] arr = new int[1];
    arr[0] = m_DocumentAtt;
    m_PunctFilter.setAttributeIndicesArray(arr);
    m_PunctFilter.setInputFormat(instanceInfo);
    m_PunctFilter.setDisallowInternalPeriods(getDisallowInternalPeriods());
    m_NumbersFilter = new NumbersFilter();
    m_NumbersFilter.setInputFormat(m_PunctFilter.getOutputFormat());
    super.setInputFormat(m_NumbersFilter.getOutputFormat());
    return false;
  }

  /**
   * Input an instance for filtering. Ordinarily the instance is processed
   * and made available for output immediately. Some filters require all
   * instances be read before producing output.
   *
   * @param instance the input instance
   * @return true if the filtered instance may now be
   * collected with output().
   * @exception Exception if the input instance was not of the correct
   * format or if there was a problem with the filtering.
   */
  public boolean input(Instance instance) throws Exception {

    if (getInputFormat() == null) {
      throw new Exception("No input instance format defined");
    }
    if (m_NewBatch) {
      resetQueue();
      m_NewBatch = false;
    }

    if (m_Debug) {
      System.err.println("-- Reading instance");
    }

    m_PunctFilter.input(instance);
    m_PunctFilter.batchFinished();
    instance = m_PunctFilter.output();

    m_NumbersFilter.input(instance);
    m_NumbersFilter.batchFinished();
    instance = m_NumbersFilter.output();

    if (m_Dictionary == null) {
      bufferInput(instance);
      return false;
    } else {
      FastVector vector = convertInstance(instance, false);
      Enumeration en = vector.elements();
      while (en.hasMoreElements()) {
        Instance inst = (Instance)en.nextElement();
        push(inst);
      }
      return true;
    }
  }

  /**
   * Signify that this batch of input to the filter is finished.
   * If the filter requires all instances prior to filtering,
   * output() may now be called to retrieve the filtered instances.
   *
   * @return true if there are instances pending output
   * @exception Exception if no input structure has been defined
   */
  public boolean batchFinished() throws Exception {

    if (getInputFormat() == null) {
      throw new Exception("No input instance format defined");
    }
    if (m_Dictionary == null) {
      buildGlobalDictionaries();
      buildClassifier();
      convertPendingInstances();
    }
    flushInput();
    m_NewBatch = true;
    return (numPendingOutput() != 0);
  }
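
  // Note on the filter's life cycle (added for clarity): during the first
  // batch, input() only buffers instances, because m_Dictionary is still
  // null. batchFinished() then builds the global dictionaries and the
  // classifier and converts the buffered instances. In later batches
  // m_Dictionary is non-null, so input() converts and pushes each
  // instance immediately.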

  /**
   * Builds the global dictionaries.
   */
  private void buildGlobalDictionaries() throws Exception {

    if (m_Debug) {
      System.err.println("--- Building global dictionaries");
    }

    // Build dictionary of n-grams with associated
    // document frequencies
    m_Dictionary = new HashMap();
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      String str = getInputFormat().instance(i).stringValue(m_DocumentAtt);
      HashMap hash = getPhrasesForDictionary(str);
      Iterator it = hash.keySet().iterator();
      while (it.hasNext()) {
        String phrase = (String)it.next();
        Counter counter = (Counter)m_Dictionary.get(phrase);
        if (counter == null) {
          m_Dictionary.put(phrase, new Counter());
        } else {
          counter.increment();
        }
      }
    }

    if (m_KFused) {

      // Build dictionary of n-grams that occur as keyphrases
      // with associated keyphrase frequencies
      m_KeyphraseDictionary = new HashMap();
      for (int i = 0; i < getInputFormat().numInstances(); i++) {
        String str = getInputFormat().instance(i).stringValue(m_KeyphrasesAtt);
        HashMap hash = getGivenKeyphrases(str, false);
        if (hash != null) {
          Iterator it = hash.keySet().iterator();
          while (it.hasNext()) {
            String phrase = (String)it.next();
            Counter counter = (Counter)m_KeyphraseDictionary.get(phrase);
            if (counter == null) {
              m_KeyphraseDictionary.put(phrase, new Counter());
            } else {
              counter.increment();
            }
          }
        }
      }
    } else {
      m_KeyphraseDictionary = null;
    }

    // Set the number of documents in the global corpus
    m_NumDocs = getInputFormat().numInstances();
  }

  /**
   * Builds the classifier.
   */
  private void buildClassifier() throws Exception {

    // Generate input format for classifier
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      if (i == m_DocumentAtt) {
        atts.addElement(new Attribute("TFxIDF"));
        atts.addElement(new Attribute("First_occurrence"));
        if (m_KFused) {
          atts.addElement(new Attribute("Keyphrase_frequency"));
        }
      } else if (i == m_KeyphrasesAtt) {
        FastVector vals = new FastVector(2);
        vals.addElement("False");
        vals.addElement("True");
        atts.addElement(new Attribute("Keyphrase?", vals));
      }
    }
    m_ClassifierData = new Instances("ClassifierData", atts, 0);
    m_ClassifierData.setClassIndex(m_NumFeatures);

    if (m_Debug) {
      System.err.println("--- Converting instances for classifier");
    }

    // Convert pending input instances into data for classifier
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance current = getInputFormat().instance(i);

      // Get the key phrases for the document
      String keyphrases = current.stringValue(m_KeyphrasesAtt);
      HashMap hashKeyphrases = getGivenKeyphrases(keyphrases, false);
      HashMap hashKeysEval = getGivenKeyphrases(keyphrases, true);

      // Get the phrases for the document
      HashMap hash = new HashMap();
      int length = getPhrases(hash, current.stringValue(m_DocumentAtt));

      // Compute the feature values for each phrase and
      // add the instance to the data for the classifier
      Iterator it = hash.keySet().iterator();
      while (it.hasNext()) {
        String phrase = (String)it.next();
        FastVector phraseInfo = (FastVector)hash.get(phrase);
        double[] vals = featVals(phrase, phraseInfo, true,
                                 hashKeysEval, hashKeyphrases, length);
        Instance inst = new Instance(current.weight(), vals);
        m_ClassifierData.add(inst);
      }
    }

    if (m_Debug) {
      System.err.println("--- Building classifier");
    }

    // Build classifier
    FilteredClassifier fclass = new FilteredClassifier();
    fclass.setClassifier(new weka.classifiers.NaiveBayesSimple());
    fclass.setFilter(new DiscretizeFilter());
    m_Classifier = fclass;
    m_Classifier.buildClassifier(m_ClassifierData);

    if (m_Debug) {
      System.err.println(m_Classifier);
    }

    // Save space
    m_ClassifierData = new Instances(m_ClassifierData, 0);
  }

  /**
   * Computes the feature values for a given phrase.
   */
  private double[] featVals(String phrase, FastVector phraseInfo,
                            boolean training, HashMap hashKeysEval,
                            HashMap hashKeyphrases, int length) {

    // Compute feature values
    Counter counterLocal = (Counter)phraseInfo.elementAt(1);
    double[] newInst = new double[m_NumFeatures + 1];

    // Compute TFxIDF
    Counter counterGlobal = (Counter)m_Dictionary.get(phrase);
    double localVal = counterLocal.value(), globalVal = 0;
    if (counterGlobal != null) {
      globalVal = counterGlobal.value();
      if (training) {
        globalVal = globalVal - 1;
      }
    }

    // Just divide by length to get approximation of probability
    // that phrase in document is our phrase
    newInst[m_TfidfIndex] = (localVal / ((double)length)) *
      (-Math.log((globalVal + 1) / ((double)m_NumDocs + 1)));

    // Compute first occurrence
    Counter counterFirst = (Counter)phraseInfo.elementAt(0);
    newInst[m_FirstOccurIndex] = (double)counterFirst.value() /
      (double)length;

    // Is keyphrase frequency attribute being used?
    if (m_KFused) {
      Counter keyphraseC = (Counter)m_KeyphraseDictionary.get(phrase);
      if ((training) && (hashKeyphrases != null) &&
          (hashKeyphrases.containsKey(phrase))) {
        newInst[m_KeyFreqIndex] = keyphraseC.value() - 1;
      } else {
        if (keyphraseC != null) {
          newInst[m_KeyFreqIndex] = keyphraseC.value();
        } else {
          newInst[m_KeyFreqIndex] = 0;
        }
      }
    }

    // Compute class value
    String phraseInEvalFormat = evalFormat((String)phraseInfo.elementAt(2));
    if (hashKeysEval == null) { // no author-assigned keyphrases
      newInst[m_NumFeatures] = Instance.missingValue();
    } else if (!hashKeysEval.containsKey(phraseInEvalFormat)) {
      newInst[m_NumFeatures] = 0; // Not a keyphrase
    } else {
      hashKeysEval.remove(phraseInEvalFormat);
      newInst[m_NumFeatures] = 1; // Keyphrase
    }
    return newInst;
  }
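
  /*
   * Worked example (added for clarity, with made-up numbers): suppose a
   * phrase occurs 3 times in a 600-word document (localVal = 3,
   * length = 600), appears in 4 of 100 training documents
   * (globalVal = 4, m_NumDocs = 100), and we are not training. Then
   *
   *   TFxIDF = (3 / 600) * -log((4 + 1) / (100 + 1))
   *          = 0.005 * -log(0.0495) ~= 0.015
   *
   * using the natural logarithm, as Math.log does above.
   */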

  /**
   * Sets output format and converts pending input instances.
   */
  private void convertPendingInstances() throws Exception {

    if (m_Debug) {
      System.err.println("--- Converting pending instances");
    }

    // Create output format for filter
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      if (i == m_DocumentAtt) {
        atts.addElement(new Attribute("N-gram", null));
        atts.addElement(new Attribute("N-gram-original", null));
        atts.addElement(new Attribute("TFxIDF"));
        atts.addElement(new Attribute("First_occurrence"));
        if (m_Debug) {
          if (m_KFused) {
            atts.addElement(new Attribute("Keyphrase_frequency"));
          }
        }
        atts.addElement(new Attribute("Probability"));
        atts.addElement(new Attribute("Rank"));
      } else if (i == m_KeyphrasesAtt) {
        FastVector vals = new FastVector(2);
        vals.addElement("False");
        vals.addElement("True");
        atts.addElement(new Attribute("Keyphrase?", vals));
      } else {
        atts.addElement(getInputFormat().attribute(i));
      }
    }
    Instances outFormat = new Instances("KEAdata", atts, 0);
    setOutputFormat(outFormat);

    // Convert pending input instances into output data
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance current = getInputFormat().instance(i);
      FastVector vector = convertInstance(current, true);
      Enumeration en = vector.elements();
      while (en.hasMoreElements()) {
        Instance inst = (Instance)en.nextElement();
        push(inst);
      }
    }
  }

  /**
   * Converts an instance.
   */
  private FastVector convertInstance(Instance instance, boolean training)
    throws Exception {

    FastVector vector = new FastVector();

    if (m_Debug) {
      System.err.println("-- Converting instance");
    }

    // Get the key phrases for the document
    HashMap hashKeyphrases = null;
    HashMap hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
      String keyphrases = instance.stringValue(m_KeyphrasesAtt);
      hashKeyphrases = getGivenKeyphrases(keyphrases, false);
      hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap hash = new HashMap();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));

    // Compute number of extra attributes
    int numFeatures = 5;
    if (m_Debug) {
      if (m_KFused) {
        numFeatures = numFeatures + 1;
      }
    }

    // Set indices of key attributes
    int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;

    // Go through the phrases and convert them into instances
    Iterator it = hash.keySet().iterator();
    while (it.hasNext()) {
      String phrase = (String)it.next();
      FastVector phraseInfo = (FastVector)hash.get(phrase);
      double[] vals = featVals(phrase, phraseInfo, training,
                               hashKeysEval, hashKeyphrases, length);
      Instance inst = new Instance(instance.weight(), vals);
      inst.setDataset(m_ClassifierData);

      // Get probability of phrase being key phrase
      double[] probs = m_Classifier.distributionForInstance(inst);
      double prob = probs[1];

      // Compute attribute values for final instance
      double[] newInst = new double[instance.numAttributes() + numFeatures];
      int pos = 0;
      for (int i = 0; i < instance.numAttributes(); i++) {
        if (i == m_DocumentAtt) {

          // Add phrase
          int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
          newInst[pos++] = index;

          // Add original version
          index = outputFormatPeek().attribute(pos).
            addStringValue((String)phraseInfo.elementAt(2));
          newInst[pos++] = index;

          // Add TFxIDF
          newInst[pos++] = inst.value(m_TfidfIndex);

          // Add distance
          newInst[pos++] = inst.value(m_FirstOccurIndex);

          // Add other features
          if (m_Debug) {
            if (m_KFused) {
              newInst[pos++] = inst.value(m_KeyFreqIndex);
            }
          }

          // Add probability
          probsAttIndex = pos;
          newInst[pos++] = prob;

          // Set rank to missing (computed below)
          newInst[pos++] = Instance.missingValue();
        } else if (i == m_KeyphrasesAtt) {
          newInst[pos++] = inst.classValue();
        } else {
          newInst[pos++] = instance.value(i);
        }
      }
      Instance ins = new Instance(instance.weight(), newInst);
      ins.setDataset(outputFormatPeek());
      vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
      Iterator phrases = hashKeysEval.keySet().iterator();
      while (phrases.hasNext()) {
        String phrase = (String)phrases.next();
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
          if (i == m_DocumentAtt) {

            // Add phrase
            int index = outputFormatPeek().attribute(pos).
              addStringValue(phrase);
            newInst[pos++] = (double)index;

            // Add original version
            index = outputFormatPeek().attribute(pos).
              addStringValue((String)hashKeysEval.get(phrase));
            newInst[pos++] = (double)index;

            // Add TFxIDF
            newInst[pos++] = Instance.missingValue();

            // Add distance
            newInst[pos++] = Instance.missingValue();

            // Add other features
            if (m_Debug) {
              if (m_KFused) {
                newInst[pos++] = Instance.missingValue();
              }
            }

            // Add probability and rank
            newInst[pos++] = -Double.MAX_VALUE;
            newInst[pos++] = Instance.missingValue();
          } else if (i == m_KeyphrasesAtt) {
            newInst[pos++] = 1; // Keyphrase
          } else {
            newInst[pos++] = instance.value(i);
          }
        }
        Instance inst = new Instance(instance.weight(), newInst);
        inst.setDataset(outputFormatPeek());
        vector.addElement(inst);
      }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
      vals[i] = ((Instance)vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
      newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
      vals[i] = -((Instance)vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
      newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
      vals[i] = 1 - ((Instance)vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
      newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and set their rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
      Instance currentInstance = (Instance)vector.elementAt(i);

      // Short cut: if phrase is very unlikely, make rank very low and continue
      if (Utils.grOrEq(vals[i], 1.0)) {
        currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
        continue;
      }

      // Otherwise look for superphrases, starting with the first phrase
      // in the list that has the same probability, TFxIDF value, and
      // distance as the current phrase. We do this to catch all
      // superphrases that have the same probability, TFxIDF value and
      // distance as the current phrase.
      int startInd = i;
      while (startInd < vals.length) {
        Instance inst = (Instance)vector.elementAt(startInd);
        if ((inst.value(tfidfAttIndex) !=
             currentInstance.value(tfidfAttIndex)) ||
            (inst.value(probsAttIndex) !=
             currentInstance.value(probsAttIndex)) ||
            (inst.value(distAttIndex) !=
             currentInstance.value(distAttIndex))) {
          break;
        }
        startInd++;
      }
      String val = currentInstance.stringValue(phraseAttIndex);
      boolean foundSuperphrase = false;
      for (int j = startInd - 1; j >= 0; j--) {
        if (j != i) {
          Instance candidate = (Instance)vector.elementAt(j);
          String potSuperphrase = candidate.stringValue(phraseAttIndex);
          if (val.length() <= potSuperphrase.length()) {
            if (KEAFilter.contains(val, potSuperphrase)) {
              foundSuperphrase = true;
              break;
            }
          }
        }
      }
      if (foundSuperphrase) {
        currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
      } else {
        currentInstance.setValue(probsAttIndex + 1, rank++);
      }
    }
    return vector;
  }

  /**
   * Checks whether one phrase is a subphrase of another phrase.
   */
  private static boolean contains(String sub, String sup) {

    int i = 0;
    while (i + sub.length() - 1 < sup.length()) {
      int j;
      for (j = 0; j < sub.length(); j++) {
        if (sub.charAt(j) != sup.charAt(i + j)) {
          break;
        }
      }
      if (j == sub.length()) {
        if ((i + j) < sup.length()) {
          if (sup.charAt(i + j) == ' ') {
            return true;
          } else {
            return false;
          }
        } else {
          return true;
        }
      }

      // Skip forward to next space
      do {
        i++;
      } while ((i < sup.length()) && (sup.charAt(i) != ' '));
      i++;
    }
    return false;
  }
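
  // Examples (added for clarity): contains("neural", "neural networks")
  // returns true, because the match ends at a word boundary;
  // contains("networks", "neural networks") also returns true, via the
  // skip-to-next-space loop; contains("net", "network") returns false,
  // because "net" only matches inside a longer word.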

  /**
   * Returns a hashtable. Fills the hashtable
   * with the stemmed n-grams occurring in the given string
   * (as keys) and the number of times each of them occurs.
   */
  private HashMap getPhrasesForDictionary(String str) {

    String[] buffer = new String[m_MaxPhraseLength];
    HashMap hash = new HashMap();

    StringTokenizer tok = new StringTokenizer(str, "\n");
    while (tok.hasMoreTokens()) {
      String phrase = tok.nextToken();
      int numSeen = 0;
      StringTokenizer wordTok = new StringTokenizer(phrase, " ");
      while (wordTok.hasMoreTokens()) {
        String word = wordTok.nextToken();

        // Store word in buffer
        for (int i = 0; i < m_MaxPhraseLength - 1; i++) {
          buffer[i] = buffer[i + 1];
        }
        buffer[m_MaxPhraseLength - 1] = word;

        // How many are buffered?
        numSeen++;
        if (numSeen > m_MaxPhraseLength) {
          numSeen = m_MaxPhraseLength;
        }

        // Don't consider phrases that end with a stop word
        if (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - 1])) {
          continue;
        }

        // Loop through buffer and add phrases to hashtable
        StringBuffer phraseBuffer = new StringBuffer();
        for (int i = 1; i <= numSeen; i++) {
          if (i > 1) {
            phraseBuffer.insert(0, ' ');
          }
          phraseBuffer.insert(0, buffer[m_MaxPhraseLength - i]);

          // Don't consider phrases that begin with a stop word
          if ((i > 1) &&
              (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - i]))) {
            continue;
          }

          // Only consider phrases with minimum length
          if (i >= m_MinPhraseLength) {

            // Stem string
            String orig = phraseBuffer.toString();
            String internal = internalFormat(orig);
            Counter count = (Counter)hash.get(internal);
            if (count == null) {
              hash.put(internal, new Counter());
            } else {
              count.increment();
            }

            // Add components if phrase is single word before
            // conversion into internal format (i.e. error-correcting)
            /*if ((orig.indexOf(' ') == -1) &&
                (internal.indexOf(' ') != -1)) {
              StringTokenizer tokW = new StringTokenizer(internal, " ");
              while (tokW.hasMoreTokens()) {
                String comp = (String)tokW.nextToken();
                Counter countW = (Counter)hash.get(comp);
                if (countW == null) {
                  hash.put(comp, new Counter());
                } else {
                  countW.increment();
                }
              }
            }*/
          }
        }
      }
    }
    return hash;
  }

  /**
   * Expects an empty hashtable. Fills the hashtable
   * with the stemmed n-grams occurring in the given string
   * (as keys). Stores the position, the number of occurrences,
   * and the most commonly occurring original version of
   * each n-gram.
   *
   * N-grams that occur fewer than m_MinNumOccur times are not used.
   *
   * Returns the total number of words (!) in the string.
   */
  private int getPhrases(HashMap hash, String str) {

    String[] buffer = new String[m_MaxPhraseLength];

    StringTokenizer tok = new StringTokenizer(str, "\n");
    int pos = 1;
    while (tok.hasMoreTokens()) {
      String phrase = tok.nextToken();
      int numSeen = 0;
      StringTokenizer wordTok = new StringTokenizer(phrase, " ");
      while (wordTok.hasMoreTokens()) {
        String word = wordTok.nextToken();

        // Store word in buffer
        for (int i = 0; i < m_MaxPhraseLength - 1; i++) {
          buffer[i] = buffer[i + 1];
        }
        buffer[m_MaxPhraseLength - 1] = word;

        // How many are buffered?
        numSeen++;
        if (numSeen > m_MaxPhraseLength) {
          numSeen = m_MaxPhraseLength;
        }

        // Don't consider phrases that end with a stop word
        if (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - 1])) {
          pos++;
          continue;
        }

        // Loop through buffer and add phrases to hashtable
        StringBuffer phraseBuffer = new StringBuffer();
        for (int i = 1; i <= numSeen; i++) {
          if (i > 1) {
            phraseBuffer.insert(0, ' ');
          }
          phraseBuffer.insert(0, buffer[m_MaxPhraseLength - i]);

          // Don't consider phrases that begin with a stop word
          if ((i > 1) &&
              (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - i]))) {
            continue;
          }

          // Only consider phrases with minimum length
          if (i >= m_MinPhraseLength) {

            // Stem string
            String phrStr = phraseBuffer.toString();
            String internal = internalFormat(phrStr);
            FastVector vec = (FastVector)hash.get(internal);
            if (vec == null) {
              vec = new FastVector(3);

              // HashMap for storing all versions
              HashMap secHash = new HashMap();
              secHash.put(phrStr, new Counter());

              // Update hashtable with all the info
              vec.addElement(new Counter(pos + 1 - i));
              vec.addElement(new Counter());
              vec.addElement(secHash);
              hash.put(internal, vec);
            } else {

              // Update number of occurrences
              ((Counter)vec.elementAt(1)).increment();

              // Update hashtable storing different versions
              HashMap secHash = (HashMap)vec.elementAt(2);
              Counter count = (Counter)secHash.get(phrStr);
              if (count == null) {
                secHash.put(phrStr, new Counter());
              } else {
                count.increment();
              }
            }
          }
        }
        pos++;
      }
    }

    // Replace secondary hashtables with the most commonly occurring
    // (canonical) version of each phrase. Delete all phrases that
    // are proper nouns.
    Iterator phrases = hash.keySet().iterator();
    while (phrases.hasNext()) {
      String phrase = (String)phrases.next();
      FastVector info = (FastVector)hash.get(phrase);

      // Occurring fewer than m_MinNumOccur times?
      if (((Counter)info.elementAt(1)).value() < m_MinNumOccur) {
        phrases.remove();
        continue;
      }

      // Get canonical form
      String canForm = canonicalForm((HashMap)info.elementAt(2));
      if (canForm == null) {
        phrases.remove();
      } else {
        info.setElementAt(canForm, 2);
      }
    }
    return pos;
  }

  /**
   * Create canonical form of phrase.
   */
  private String canonicalForm(HashMap secHash) {

    int max = 0;
    String bestVersion = null;
    boolean allFullyHyphenated = true;
    int num = 0;
    Iterator versions = secHash.keySet().iterator();
    while (versions.hasNext()) {
      num++;
      String version = (String)versions.next();

      // Are all the words joined up?
      if (!isFullyHyphenated(version)) {
        allFullyHyphenated = false;
      }

      // Check for how often this version occurs
      Counter count = (Counter)secHash.get(version);
      if (count.value() > max) {
        max = count.value();
        bestVersion = version;
      }
    }
    if ((getCheckForProperNouns()) && (num == 1) && properNoun(bestVersion)) {
      //if (allProperNouns) {
      return null;
    } else {
      if (isFullyHyphenated(bestVersion) &&
          !allFullyHyphenated) {
        bestVersion = bestVersion.replace('-', ' ');
      }
      return bestVersion;
    }
  }
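
  // Example (added for clarity): if a phrase was seen as "case-based"
  // three times and "case based" once, the hyphenated form wins on
  // frequency, but because not every occurrence was hyphenated, the
  // hyphens are replaced by spaces, giving "case based". A phrase whose
  // only observed version looks like a proper noun (e.g. "Java") is
  // dropped entirely when getCheckForProperNouns() is true.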

  /**
   * Checks whether the given phrase is
   * fully hyphenated.
   */
  private boolean isFullyHyphenated(String str) {

    return (str.indexOf(' ') == -1);
  }

  /**
   * Checks whether the given string is a
   * proper noun.
   *
   * @return true if it is a potential proper noun
   */
  private static boolean properNoun(String str) {

    // Is it more than one word?
    if (str.indexOf(' ') != -1) {
      return false;
    }

    // Does it start with an upper-case character?
    if (Character.isLowerCase(str.charAt(0))) {
      return false;
    }

    // Is there at least one character that's
    // not upper-case?
    for (int i = 1; i < str.length(); i++) {
      /*if (Character.isUpperCase(str.charAt(i))) {
        return false;
      }*/
      if (!Character.isUpperCase(str.charAt(i))) {
        return true;
      }
    }
    //return true;
    return false;
  }
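
  // Examples (added for clarity): "Java" is a potential proper noun
  // (one word, initial capital, lower-case letters after it); "JAVA"
  // is not (treated as an acronym, since no later character is
  // lower-case); "java" and "java beans" are not, either.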

  /**
   * Generates the evaluation format of a phrase.
   */
  private String evalFormat(String str) {

    return m_Stemmer.stemString(str);
  }

  /**
   * Generates the internal format of a phrase.
   */
  private String internalFormat(String str) {

    // Remove some non-alphanumeric characters
    str = str.replace('-', ' ');
    str = str.replace('/', ' ');
    str = str.replace('&', ' ');

    // Stem string
    return m_Stemmer.stemString(str);
  }
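
  // Example (added for clarity): internalFormat("case-based/indexing")
  // first becomes "case based indexing" and is then stemmed; the exact
  // stemmed form depends on the iterated Lovins stemmer, so it is not
  // reproduced here.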

  /**
   * Gets all the phrases in the given string and puts them into the
   * hashtable. Also stores the original version of the stemmed
   * phrase in the hashtable.
   */
  private HashMap getGivenKeyphrases(String str,
                                     boolean forEval) {

    HashMap hash = new HashMap();

    StringTokenizer tok = new StringTokenizer(str, "\n");
    while (tok.hasMoreTokens()) {
      String orig = tok.nextToken();
      orig = orig.trim();
      if (orig.length() > 0) {
        String modified;
        if (!forEval) {
          modified = internalFormat(orig);
        } else {
          modified = evalFormat(orig);
        }
        if (!hash.containsKey(modified)) {
          hash.put(modified, orig);
        } else {
          if (forEval) {
            System.err.println("WARNING: stem of author-assigned keyphrase " +
                               orig + " matches existing stem (skipping it)!");
          }
        }
      }
    }
    if (hash.size() == 0) {
      return null;
    } else {
      return hash;
    }
  }

  /**
   * Main method for testing this class.
   *
   * @param argv should contain arguments to the filter: use -h for help
   */
  public static void main(String [] argv) {

    try {
      if (Utils.getFlag('b', argv)) {
        Filter.batchFilterFile(new KEAFilter(), argv);
      } else {
        Filter.filterFile(new KEAFilter(), argv);
      }
    } catch (Exception ex) {
      System.out.println(ex.getMessage());
    }
  }
}
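
/*
 * Usage sketch (added; not part of the original source, assuming the
 * Weka 3.x Instances/Filter API this file targets): given a dataset
 * whose first string attribute holds the document text and whose second
 * holds the author-assigned keyphrases, the filter can be applied like
 * this:
 *
 *   Instances data = new Instances(
 *       new BufferedReader(new FileReader("documents.arff")));
 *   KEAFilter filter = new KEAFilter();
 *   filter.setInputFormat(data);
 *   for (int i = 0; i < data.numInstances(); i++) {
 *     filter.input(data.instance(i));
 *   }
 *   filter.batchFinished();
 *   Instance converted;
 *   while ((converted = filter.output()) != null) {
 *     // each output instance is one candidate phrase with its TFxIDF,
 *     // first-occurrence, probability and rank values
 *   }
 */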