1 | /*
|
---|
2 | * KEAFilter.java
|
---|
3 | * Copyright (C) 2000, 2001 Eibe Frank
|
---|
4 | *
|
---|
5 | * This program is free software; you can redistribute it and/or modify
|
---|
6 | * it under the terms of the GNU General Public License as published by
|
---|
7 | * the Free Software Foundation; either version 2 of the License, or
|
---|
8 | * (at your option) any later version.
|
---|
9 | *
|
---|
10 | * This program is distributed in the hope that it will be useful,
|
---|
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
13 | * GNU General Public License for more details.
|
---|
14 | *
|
---|
15 | * You should have received a copy of the GNU General Public License
|
---|
16 | * along with this program; if not, write to the Free Software
|
---|
17 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
18 | */
|
---|
19 |
|
---|
20 | import java.util.*;
|
---|
21 | import java.io.*;
|
---|
22 |
|
---|
23 | import weka.core.*;
|
---|
24 | import weka.filters.*;
|
---|
25 | import weka.classifiers.*;
|
---|
26 |
|
---|
27 | /**
|
---|
28 | * This filter converts the incoming data into data appropriate for
|
---|
29 | * keyphrase classification. It assumes that the dataset contains two
|
---|
30 | * string attributes. The first attribute should contain the text of a
|
---|
31 | * document. The second attribute should contain the keyphrases
|
---|
32 | * associated with that document (if present).
|
---|
33 | *
|
---|
34 | * The filter converts every instance (i.e. document) into a set of
|
---|
35 | * instances, one for each word-based n-gram in the document. The
|
---|
36 | * string attribute representing the document is replaced by some
|
---|
37 | * numeric features, the estimated probability of each n-gram being a
|
---|
38 | * keyphrase, and the rank of this phrase in the document according to
|
---|
39 | * the probability. Each new instances also has a class value
|
---|
40 | * associated with it. The class is "true" if the n-gram is a true
|
---|
41 | * keyphrase, and "false" otherwise. Of course, if the input document
|
---|
42 | * doesn't come with author-assigned keyphrases, the class values for
|
---|
43 | * that document will be missing.
|
---|
44 | *
|
---|
45 | * @author Eibe Frank ([email protected])
|
---|
46 | * @version 1.0
|
---|
47 | */
|
---|
48 | public class KEAFilter extends Filter implements OptionHandler {
|
---|
49 |
|
---|
  /** Index of attribute containing the documents */
  private int m_DocumentAtt = 0;

  /** Index of attribute containing the keyphrases */
  private int m_KeyphrasesAtt = 1;

  /** The maximum length of phrases */
  private int m_MaxPhraseLength = 3;

  /** The minimum length of phrases */
  private int m_MinPhraseLength = 1;

  /** Is keyphrase frequency attribute being used? */
  private boolean m_KFused = false;

  /** Flag for debugging mode */
  private boolean m_Debug = false;

  /** Determines whether internal periods are allowed */
  private boolean m_DisallowInternalPeriods = false;

  /** The minimum number of occurrences of a phrase */
  private int m_MinNumOccur = 2;

  /** The number of features describing a phrase
      (2, or 3 when the keyphrase frequency attribute
      is in use -- see setKFused) */
  private int m_NumFeatures = 2;

  /* Indices of the feature attributes in m_ClassifierData */
  private int m_TfidfIndex = 0;
  private int m_FirstOccurIndex = 1;
  private int m_KeyFreqIndex = 2;

  /** The punctuation filter used by this filter */
  private KEAPhraseFilter m_PunctFilter = null;

  /** The numbers filter used by this filter */
  private NumbersFilter m_NumbersFilter = null;

  /** The actual classifier used to compute probabilities */
  private DistributionClassifier m_Classifier = null;

  /** The dictionary containing the document frequencies */
  private HashMap m_Dictionary = null;

  /** The dictionary containing the keyphrases */
  private HashMap m_KeyphraseDictionary = null;

  /** The number of documents in the global frequencies corpus */
  private int m_NumDocs = 0;

  /** Template for the classifier data */
  private Instances m_ClassifierData = null;

  /** The stemmer to be used */
  private Stemmer m_Stemmer = new IteratedLovinsStemmer();

  /** The list of stop words to be used */
  private Stopwords m_Stopwords = new StopwordsEnglish();

  /** Determines whether check for proper nouns is performed */
  private boolean m_CheckForProperNouns = true;
|
---|
111 |
|
---|
  /**
   * Gets whether the check for proper nouns is performed.
   *
   * @return the value of the CheckForProperNouns flag
   */
  public boolean getCheckForProperNouns() {
    return m_CheckForProperNouns;
  }

  /**
   * Sets whether the check for proper nouns is performed.
   *
   * @param newM_CheckProperNouns the new value of the flag
   */
  public void setCheckForProperNouns(boolean newM_CheckProperNouns) {
    this.m_CheckForProperNouns = newM_CheckProperNouns;
  }
|
---|
127 |
|
---|
  /**
   * Gets the list of stop words in use.
   *
   * @return the current Stopwords object
   */
  public Stopwords getStopwords() {
    return m_Stopwords;
  }

  /**
   * Sets the list of stop words to use.
   *
   * @param newM_Stopwords the new Stopwords object
   */
  public void setStopwords(Stopwords newM_Stopwords) {
    this.m_Stopwords = newM_Stopwords;
  }
|
---|
143 |
|
---|
144 |
|
---|
  /**
   * Gets the stemmer in use.
   *
   * @return the current Stemmer object
   */
  public Stemmer getStemmer() {

    return m_Stemmer;
  }

  /**
   * Sets the stemmer to use.
   *
   * @param newStemmer the new Stemmer object
   */
  public void setStemmer(Stemmer newStemmer) {

    this.m_Stemmer = newStemmer;
  }
|
---|
162 |
|
---|
  /**
   * Get the value of MinNumOccur, the minimum number of times a phrase
   * must occur in a document to become a candidate keyphrase.
   *
   * @return Value of MinNumOccur.
   */
  public int getMinNumOccur() {

    return m_MinNumOccur;
  }

  /**
   * Set the value of MinNumOccur.
   *
   * @param newMinNumOccur Value to assign to MinNumOccur.
   */
  public void setMinNumOccur(int newMinNumOccur) {

    m_MinNumOccur = newMinNumOccur;
  }
|
---|
182 |
|
---|
  /**
   * Get the value of MaxPhraseLength, the maximum length of a
   * candidate phrase.
   *
   * @return Value of MaxPhraseLength.
   */
  public int getMaxPhraseLength() {

    return m_MaxPhraseLength;
  }

  /**
   * Set the value of MaxPhraseLength.
   *
   * @param newMaxPhraseLength Value to assign to MaxPhraseLength.
   */
  public void setMaxPhraseLength(int newMaxPhraseLength) {

    m_MaxPhraseLength = newMaxPhraseLength;
  }
|
---|
202 |
|
---|
  /**
   * Get the value of MinPhraseLength, the minimum length of a
   * candidate phrase.
   *
   * @return Value of MinPhraseLength.
   */
  public int getMinPhraseLength() {

    return m_MinPhraseLength;
  }

  /**
   * Set the value of MinPhraseLength.
   *
   * @param newMinPhraseLength Value to assign to MinPhraseLength.
   */
  public void setMinPhraseLength(int newMinPhraseLength) {

    m_MinPhraseLength = newMinPhraseLength;
  }
|
---|
222 |
|
---|
  /**
   * Returns the index of the stemmed phrases in the output ARFF file.
   * This is the position of the original document attribute.
   */
  public int getStemmedPhraseIndex() {

    return m_DocumentAtt;
  }

  /**
   * Returns the index of the unstemmed phrases in the output ARFF file.
   * It directly follows the stemmed-phrase attribute.
   */
  public int getUnstemmedPhraseIndex() {

    return m_DocumentAtt + 1;
  }
|
---|
238 |
|
---|
  /**
   * Returns the index of the phrases' probabilities in the output ARFF file.
   * The probability attribute follows the two n-gram string attributes and
   * the two feature attributes (TFxIDF and First_occurrence); in debug mode
   * with the keyphrase frequency attribute in use, one extra attribute
   * (Keyphrase_frequency) precedes it.
   */
  public int getProbabilityIndex() {

    int index = m_DocumentAtt + 4;

    if (m_Debug) {
      if (m_KFused) {
        // Keyphrase_frequency is only included in the output in debug mode.
        index++;
      }
    }
    return index;
  }

  /**
   * Returns the index of the phrases' ranks in the output ARFF file.
   * The rank attribute immediately follows the probability attribute.
   */
  public int getRankIndex() {

    return getProbabilityIndex() + 1;
  }
|
---|
261 |
|
---|
  /**
   * Get the value of DocumentAtt, the index of the string attribute
   * that holds the document text.
   *
   * @return Value of DocumentAtt.
   */
  public int getDocumentAtt() {

    return m_DocumentAtt;
  }

  /**
   * Set the value of DocumentAtt.
   *
   * @param newDocumentAtt Value to assign to DocumentAtt.
   */
  public void setDocumentAtt(int newDocumentAtt) {

    m_DocumentAtt = newDocumentAtt;
  }
|
---|
281 |
|
---|
  /**
   * Get the value of KeyphrasesAtt, the index of the string attribute
   * that holds the author-assigned keyphrases.
   *
   * @return Value of KeyphrasesAtt.
   */
  public int getKeyphrasesAtt() {

    return m_KeyphrasesAtt;
  }

  /**
   * Set the value of KeyphrasesAtt.
   *
   * @param newKeyphrasesAtt Value to assign to KeyphrasesAtt.
   */
  public void setKeyphrasesAtt(int newKeyphrasesAtt) {

    m_KeyphrasesAtt = newKeyphrasesAtt;
  }
|
---|
301 |
|
---|
302 |
|
---|
  /**
   * Get the value of Debug.
   *
   * @return Value of Debug.
   */
  public boolean getDebug() {

    return m_Debug;
  }

  /**
   * Set the value of Debug. In debug mode, progress messages are written
   * to standard error and extra attributes appear in the output format.
   *
   * @param newDebug Value to assign to Debug.
   */
  public void setDebug(boolean newDebug) {

    m_Debug = newDebug;
  }
|
---|
322 |
|
---|
  /**
   * Sets whether the keyphrase frequency attribute is used, and adjusts
   * the number of phrase features accordingly (3 with the attribute,
   * 2 without).
   */
  public void setKFused(boolean flag) {

    m_KFused = flag;
    if (flag) {
      m_NumFeatures = 3;
    } else {
      m_NumFeatures = 2;
    }
  }

  /**
   * Gets whether the keyphrase frequency attribute is used.
   */
  public boolean getKFused() {

    return m_KFused;
  }
|
---|
343 |
|
---|
  /**
   * Gets whether internal periods are disallowed in candidate phrases.
   * (Setting is forwarded to the punctuation filter in setInputFormat.)
   *
   * @return true if phrases with internal periods are disallowed
   */
  public boolean getDisallowInternalPeriods() {

    return m_DisallowInternalPeriods;
  }

  /**
   * Sets whether internal periods are disallowed in candidate phrases.
   *
   * @param disallow the new setting
   */
  public void setDisallowInternalPeriods(boolean disallow) {

    m_DisallowInternalPeriods = disallow;
  }
|
---|
364 |
|
---|
365 | /**
|
---|
366 | * Parses a given list of options controlling the behaviour of this object.
|
---|
367 | * Valid options are:<p>
|
---|
368 | *
|
---|
369 | * -K<br>
|
---|
370 | * Specifies whether keyphrase frequency statistic is used.<p>
|
---|
371 | *
|
---|
372 | * -M length<br>
|
---|
373 | * Sets the maximum phrase length (default: 3).<p>
|
---|
374 | *
|
---|
375 | * -L length<br>
|
---|
376 | * Sets the minimum phrase length (default: 1).<p>
|
---|
377 | *
|
---|
378 | * -D<br>
|
---|
379 | * Turns debugging mode on.<p>
|
---|
380 | *
|
---|
381 | * -I index<br>
|
---|
382 | * Sets the index of the attribute containing the documents (default: 0).<p>
|
---|
383 | *
|
---|
384 | * -J index<br>
|
---|
385 | * Sets the index of the attribute containing the keyphrases (default: 1).<p>
|
---|
386 | *
|
---|
387 | * -P<br>
|
---|
388 | * Disallow internal periods <p>
|
---|
389 | *
|
---|
390 | * -O number<br>
|
---|
391 | * The minimum number of times a phrase needs to occur (default: 2). <p>
|
---|
392 | *
|
---|
393 | * @param options the list of options as an array of strings
|
---|
394 | * @exception Exception if an option is not supported
|
---|
395 | */
|
---|
396 | public void setOptions(String[] options) throws Exception {
|
---|
397 |
|
---|
398 | setKFused(Utils.getFlag('K', options));
|
---|
399 | setDebug(Utils.getFlag('D', options));
|
---|
400 | String docAttIndexString = Utils.getOption('I', options);
|
---|
401 | if (docAttIndexString.length() > 0) {
|
---|
402 | setDocumentAtt(Integer.parseInt(docAttIndexString) - 1);
|
---|
403 | } else {
|
---|
404 | setDocumentAtt(0);
|
---|
405 | }
|
---|
406 | String keyphraseAttIndexString = Utils.getOption('J', options);
|
---|
407 | if (keyphraseAttIndexString.length() > 0) {
|
---|
408 | setKeyphrasesAtt(Integer.parseInt(keyphraseAttIndexString) - 1);
|
---|
409 | } else {
|
---|
410 | setKeyphrasesAtt(1);
|
---|
411 | }
|
---|
412 | String maxPhraseLengthString = Utils.getOption('M', options);
|
---|
413 | if (maxPhraseLengthString.length() > 0) {
|
---|
414 | setMaxPhraseLength(Integer.parseInt(maxPhraseLengthString));
|
---|
415 | } else {
|
---|
416 | setMaxPhraseLength(3);
|
---|
417 | }
|
---|
418 | String minPhraseLengthString = Utils.getOption('M', options);
|
---|
419 | if (minPhraseLengthString.length() > 0) {
|
---|
420 | setMinPhraseLength(Integer.parseInt(minPhraseLengthString));
|
---|
421 | } else {
|
---|
422 | setMinPhraseLength(1);
|
---|
423 | }
|
---|
424 | String minNumOccurString = Utils.getOption('O', options);
|
---|
425 | if (minNumOccurString.length() > 0) {
|
---|
426 | setMinNumOccur(Integer.parseInt(minNumOccurString));
|
---|
427 | } else {
|
---|
428 | setMinNumOccur(2);
|
---|
429 | }
|
---|
430 | setDisallowInternalPeriods(Utils.getFlag('P', options));
|
---|
431 | }
|
---|
432 |
|
---|
433 | /**
|
---|
434 | * Gets the current settings of the filter.
|
---|
435 | *
|
---|
436 | * @return an array of strings suitable for passing to setOptions
|
---|
437 | */
|
---|
438 | public String [] getOptions() {
|
---|
439 |
|
---|
440 | String [] options = new String [13];
|
---|
441 | int current = 0;
|
---|
442 |
|
---|
443 | if (getKFused()) {
|
---|
444 | options[current++] = "-K";
|
---|
445 | }
|
---|
446 | if (getDebug()) {
|
---|
447 | options[current++] = "-D";
|
---|
448 | }
|
---|
449 | options[current++] = "-I";
|
---|
450 | options[current++] = "" + (getDocumentAtt() + 1);
|
---|
451 | options[current++] = "-J";
|
---|
452 | options[current++] = "" + (getKeyphrasesAtt() + 1);
|
---|
453 | options[current++] = "-M";
|
---|
454 | options[current++] = "" + (getMaxPhraseLength());
|
---|
455 | options[current++] = "-L";
|
---|
456 | options[current++] = "" + (getMinPhraseLength());
|
---|
457 | options[current++] = "-O";
|
---|
458 | options[current++] = "" + (getMinNumOccur());
|
---|
459 |
|
---|
460 | if (getDisallowInternalPeriods()) {
|
---|
461 | options[current++] = "-P";
|
---|
462 | }
|
---|
463 |
|
---|
464 | while (current < options.length) {
|
---|
465 | options[current++] = "";
|
---|
466 | }
|
---|
467 | return options;
|
---|
468 | }
|
---|
469 |
|
---|
470 | /**
|
---|
471 | * Returns an enumeration describing the available options
|
---|
472 | *
|
---|
473 | * @return an enumeration of all the available options
|
---|
474 | */
|
---|
475 | public Enumeration listOptions() {
|
---|
476 |
|
---|
477 | Vector newVector = new Vector(7);
|
---|
478 |
|
---|
479 | newVector.addElement(new Option(
|
---|
480 | "\tSpecifies whether keyphrase frequency statistic is used.",
|
---|
481 | "K", 0, "-K"));
|
---|
482 | newVector.addElement(new Option(
|
---|
483 | "\tSets the maximum phrase length (default: 3).",
|
---|
484 | "M", 1, "-M <length>"));
|
---|
485 | newVector.addElement(new Option(
|
---|
486 | "\tSets the minimum phrase length (default: 1).",
|
---|
487 | "L", 1, "-L <length>"));
|
---|
488 | newVector.addElement(new Option(
|
---|
489 | "\tTurns debugging mode on.",
|
---|
490 | "D", 0, "-D"));
|
---|
491 | newVector.addElement(new Option(
|
---|
492 | "\tSets the index of the document attribute (default: 0).",
|
---|
493 | "I", 1, "-I"));
|
---|
494 | newVector.addElement(new Option(
|
---|
495 | "\tSets the index of the keyphrase attribute (default: 1).",
|
---|
496 | "J", 1, "-J"));
|
---|
497 | newVector.addElement(new Option(
|
---|
498 | "\tDisallow internal periods.",
|
---|
499 | "P", 0, "-P"));
|
---|
500 | newVector.addElement(new Option(
|
---|
501 | "\tSet the minimum number of occurences (default: 2).",
|
---|
502 | "O", 1, "-O"));
|
---|
503 |
|
---|
504 | return newVector.elements();
|
---|
505 | }
|
---|
506 |
|
---|
507 | /**
|
---|
508 | * Returns a string describing this filter
|
---|
509 | *
|
---|
510 | * @return a description of the filter suitable for
|
---|
511 | * displaying in the explorer/experimenter gui
|
---|
512 | */
|
---|
513 | public String globalInfo() {
|
---|
514 | return "Converts incoming data into data appropriate for " +
|
---|
515 | "keyphrase classification.";
|
---|
516 | }
|
---|
517 |
|
---|
  /**
   * Sets the format of the input instances. Validates that the document and
   * keyphrase attributes are string attributes and sets up the punctuation
   * and numbers pre-processing filters.
   *
   * @param instanceInfo an Instances object containing the input
   * instance structure (any instances contained in the object are
   * ignored - only the structure is required).
   * @return true if the outputFormat may be collected immediately
   */
  public boolean setInputFormat(Instances instanceInfo) throws Exception {

    if (instanceInfo.classIndex() >= 0) {
      throw new Exception("Don't know what do to if class index set!");
    }
    if (!instanceInfo.attribute(m_KeyphrasesAtt).isString() ||
        !instanceInfo.attribute(m_DocumentAtt).isString()) {
      throw new Exception("Keyphrase attribute and document attribute " +
                          "need to be string attributes.");
    }
    // Punctuation filter applies to the document attribute only.
    m_PunctFilter = new KEAPhraseFilter();
    int[] arr = new int[1];
    arr[0] = m_DocumentAtt;
    m_PunctFilter.setAttributeIndicesArray(arr);
    m_PunctFilter.setInputFormat(instanceInfo);
    m_PunctFilter.setDisallowInternalPeriods(getDisallowInternalPeriods());
    // Numbers filter is chained after the punctuation filter.
    m_NumbersFilter = new NumbersFilter();
    m_NumbersFilter.setInputFormat(m_PunctFilter.getOutputFormat());
    super.setInputFormat(m_NumbersFilter.getOutputFormat());
    // Output format cannot be determined until the dictionaries and the
    // classifier have been built from the first batch.
    return false;
  }
|
---|
547 |
|
---|
548 | /**
|
---|
549 | * Input an instance for filtering. Ordinarily the instance is processed
|
---|
550 | * and made available for output immediately. Some filters require all
|
---|
551 | * instances be read before producing output.
|
---|
552 | *
|
---|
553 | * @param instance the input instance
|
---|
554 | * @return true if the filtered instance may now be
|
---|
555 | * collected with output().
|
---|
556 | * @exception Exception if the input instance was not of the correct
|
---|
557 | * format or if there was a problem with the filtering.
|
---|
558 | */
|
---|
559 | public boolean input(Instance instance) throws Exception {
|
---|
560 |
|
---|
561 | if (getInputFormat() == null) {
|
---|
562 | throw new Exception("No input instance format defined");
|
---|
563 | }
|
---|
564 | if (m_NewBatch) {
|
---|
565 | resetQueue();
|
---|
566 | m_NewBatch = false;
|
---|
567 | }
|
---|
568 |
|
---|
569 | if (m_Debug) {
|
---|
570 | System.err.println("-- Reading instance");
|
---|
571 | }
|
---|
572 |
|
---|
573 | m_PunctFilter.input(instance);
|
---|
574 | m_PunctFilter.batchFinished();
|
---|
575 | instance = m_PunctFilter.output();
|
---|
576 |
|
---|
577 | m_NumbersFilter.input(instance);
|
---|
578 | m_NumbersFilter.batchFinished();
|
---|
579 | instance = m_NumbersFilter.output();
|
---|
580 |
|
---|
581 | if (m_Dictionary == null) {
|
---|
582 | bufferInput(instance);
|
---|
583 | return false;
|
---|
584 | } else {
|
---|
585 | FastVector vector = convertInstance(instance, false);
|
---|
586 | Enumeration enum = vector.elements();
|
---|
587 | while (enum.hasMoreElements()) {
|
---|
588 | Instance inst = (Instance)enum.nextElement();
|
---|
589 | push(inst);
|
---|
590 | }
|
---|
591 | return true;
|
---|
592 | }
|
---|
593 | }
|
---|
594 |
|
---|
  /**
   * Signify that this batch of input to the filter is finished.
   * If the filter requires all instances prior to filtering,
   * output() may now be called to retrieve the filtered instances.
   *
   * @return true if there are instances pending output
   * @exception Exception if no input structure has been defined
   */
  public boolean batchFinished() throws Exception {

    if (getInputFormat() == null) {
      throw new Exception("No input instance format defined");
    }
    if (m_Dictionary == null) {
      // First batch: build global statistics and the classifier from the
      // buffered instances, then convert them to the output format.
      buildGlobalDictionaries();
      buildClassifier();
      convertPendingInstances();
    }
    flushInput();
    m_NewBatch = true;
    return (numPendingOutput() != 0);
  }
|
---|
617 |
|
---|
  /**
   * Builds the global dictionaries: document frequencies for all candidate
   * phrases and (if the keyphrase frequency attribute is used) keyphrase
   * frequencies for the author-assigned keyphrases.
   */
  private void buildGlobalDictionaries() throws Exception {

    if (m_Debug) {
      System.err.println("--- Building global dictionaries");
    }

    // Build dictionary of n-grams with associated
    // document frequencies
    m_Dictionary = new HashMap();
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      String str = getInputFormat().instance(i).stringValue(m_DocumentAtt);
      HashMap hash = getPhrasesForDictionary(str);
      Iterator it = hash.keySet().iterator();
      while (it.hasNext()) {
        String phrase = (String)it.next();
        Counter counter = (Counter)m_Dictionary.get(phrase);
        if (counter == null) {
          // First document containing this phrase.
          m_Dictionary.put(phrase, new Counter());
        } else {
          counter.increment();
        }
      }
    }

    if (m_KFused) {

      // Build dictionary of n-grams that occur as keyphrases
      // with associated keyphrase frequencies
      m_KeyphraseDictionary = new HashMap();
      for (int i = 0; i < getInputFormat().numInstances(); i++) {
        String str = getInputFormat().instance(i).stringValue(m_KeyphrasesAtt);
        HashMap hash = getGivenKeyphrases(str, false);
        if (hash != null) {
          Iterator it = hash.keySet().iterator();
          while (it.hasNext()) {
            String phrase = (String)it.next();
            Counter counter = (Counter)m_KeyphraseDictionary.get(phrase);
            if (counter == null) {
              m_KeyphraseDictionary.put(phrase, new Counter());
            } else {
              counter.increment();
            }
          }
        }
      }
    } else {
      m_KeyphraseDictionary = null;
    }

    // Set the number of documents in the global corpus
    m_NumDocs = getInputFormat().numInstances();
  }
|
---|
673 |
|
---|
  /**
   * Builds the classifier. Converts each buffered training document into
   * one instance per candidate phrase (feature values plus a "Keyphrase?"
   * class) and trains a discretized naive Bayes model on them.
   */
  private void buildClassifier() throws Exception {

    // Generate input format for classifier: the document attribute is
    // replaced by the numeric feature attributes, the keyphrase attribute
    // by the binary class.
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      if (i == m_DocumentAtt) {
        atts.addElement(new Attribute("TFxIDF"));
        atts.addElement(new Attribute("First_occurrence"));
        if (m_KFused) {
          atts.addElement(new Attribute("Keyphrase_frequency"));
        }
      } else if (i == m_KeyphrasesAtt) {
        FastVector vals = new FastVector(2);
        vals.addElement("False");
        vals.addElement("True");
        atts.addElement(new Attribute("Keyphrase?", vals));
      }
    }
    m_ClassifierData = new Instances("ClassifierData", atts, 0);
    // Class attribute directly follows the m_NumFeatures feature attributes.
    m_ClassifierData.setClassIndex(m_NumFeatures);

    if (m_Debug) {
      System.err.println("--- Converting instances for classifier");
    }

    // Convert pending input instances into data for classifier
    for(int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance current = getInputFormat().instance(i);

      // Get the key phrases for the document
      String keyphrases = current.stringValue(m_KeyphrasesAtt);
      HashMap hashKeyphrases = getGivenKeyphrases(keyphrases, false);
      HashMap hashKeysEval = getGivenKeyphrases(keyphrases, true);

      // Get the phrases for the document
      HashMap hash = new HashMap();
      int length = getPhrases(hash, current.stringValue(m_DocumentAtt));

      // Compute the feature values for each phrase and
      // add the instance to the data for the classifier
      Iterator it = hash.keySet().iterator();
      while (it.hasNext()) {
        String phrase = (String)it.next();
        FastVector phraseInfo = (FastVector)hash.get(phrase);
        double[] vals = featVals(phrase, phraseInfo, true,
                                 hashKeysEval, hashKeyphrases, length);
        Instance inst = new Instance(current.weight(), vals);
        m_ClassifierData.add(inst);
      }
    }

    if (m_Debug) {
      System.err.println("--- Building classifier");
    }

    // Build classifier: naive Bayes on discretized feature values.
    FilteredClassifier fclass = new FilteredClassifier();
    fclass.setClassifier(new weka.classifiers.NaiveBayesSimple());
    fclass.setFilter(new DiscretizeFilter());
    m_Classifier = fclass;
    m_Classifier.buildClassifier(m_ClassifierData);

    if (m_Debug) {
      System.err.println(m_Classifier);
    }

    // Save space: keep only the header of the training data.
    m_ClassifierData = new Instances(m_ClassifierData, 0);
  }
|
---|
746 |
|
---|
  /**
   * Computes the feature values for a given phrase.
   *
   * @param phrase the phrase (in dictionary format)
   * @param phraseInfo the phrase's statistics: first-occurrence counter at
   * index 0, local frequency counter at index 1, and the phrase's string
   * form used for evaluation at index 2 (as filled in by getPhrases --
   * confirm against that method)
   * @param training true if the phrase comes from a training document, in
   * which case the document's own contribution is excluded from the global
   * counts
   * @param hashKeysEval author-assigned keyphrases in evaluation format, or
   * null if none were supplied; matched entries are removed as a side effect
   * @param hashKeyphrases author-assigned keyphrases in dictionary format,
   * or null
   * @param length the document length as returned by getPhrases()
   * @return the feature values, with the class value in the last slot
   */
  private double[] featVals(String phrase, FastVector phraseInfo,
			    boolean training, HashMap hashKeysEval,
			    HashMap hashKeyphrases, int length) {

    // Compute feature values
    Counter counterLocal = (Counter)phraseInfo.elementAt(1);
    double[] newInst = new double[m_NumFeatures + 1];

    // Compute TFxIDF
    Counter counterGlobal = (Counter)m_Dictionary.get(phrase);
    double localVal = counterLocal.value(), globalVal = 0;
    if (counterGlobal != null) {
      globalVal = counterGlobal.value();
      if (training) {
        // Exclude the current training document from the document frequency.
        globalVal = globalVal - 1;
      }
    }

    // Just divide by length to get approximation of probability
    // that phrase in document is our phrase
    newInst[m_TfidfIndex] = (localVal / ((double)length)) *
      (-Math.log((globalVal + 1)/ ((double)m_NumDocs + 1)));

    // Compute first occurrence, normalized by document length
    Counter counterFirst = (Counter)phraseInfo.elementAt(0);
    newInst[m_FirstOccurIndex] = (double)counterFirst.value() /
      (double)length;

    // Is keyphrase frequency attribute being used?
    if (m_KFused) {
      Counter keyphraseC = (Counter)m_KeyphraseDictionary.get(phrase);
      if ((training) && (hashKeyphrases != null) &&
	  (hashKeyphrases.containsKey(phrase))) {
        // Subtract one so the current training document's own keyphrase
        // assignment is not counted.
	newInst[m_KeyFreqIndex] = keyphraseC.value() - 1;
      } else {
	if (keyphraseC != null) {
	  newInst[m_KeyFreqIndex] = keyphraseC.value();
	} else {
	  newInst[m_KeyFreqIndex] = 0;
	}
      }
    }

    // Compute class value
    String phraseInEvalFormat = evalFormat((String)phraseInfo.elementAt(2));
    if (hashKeysEval == null) { // no author-assigned keyphrases
      newInst[m_NumFeatures] = Instance.missingValue();
    } else if (!hashKeysEval.containsKey(phraseInEvalFormat)) {
      newInst[m_NumFeatures] = 0; // No keyphrase
    } else {
      // Remove the matched keyphrase so each assigned keyphrase is only
      // credited once per document.
      hashKeysEval.remove(phraseInEvalFormat);
      newInst[m_NumFeatures] = 1; // Keyphrase
    }
    return newInst;
  }
|
---|
805 |
|
---|
806 | /**
|
---|
807 | * Sets output format and converts pending input instances.
|
---|
808 | */
|
---|
809 | private void convertPendingInstances() throws Exception {
|
---|
810 |
|
---|
811 | if (m_Debug) {
|
---|
812 | System.err.println("--- Converting pending instances");
|
---|
813 | }
|
---|
814 |
|
---|
815 | // Create output format for filter
|
---|
816 | FastVector atts = new FastVector();
|
---|
817 | for (int i = 0; i < getInputFormat().numAttributes(); i++) {
|
---|
818 | if (i == m_DocumentAtt) {
|
---|
819 | atts.addElement(new Attribute("N-gram", null));
|
---|
820 | atts.addElement(new Attribute("N-gram-original", null));
|
---|
821 | atts.addElement(new Attribute("TFxIDF"));
|
---|
822 | atts.addElement(new Attribute("First_occurrence"));
|
---|
823 | if (m_Debug) {
|
---|
824 | if (m_KFused) {
|
---|
825 | atts.addElement(new Attribute("Keyphrase_frequency"));
|
---|
826 | }
|
---|
827 | }
|
---|
828 | atts.addElement(new Attribute("Probability"));
|
---|
829 | atts.addElement(new Attribute("Rank"));
|
---|
830 | } else if (i == m_KeyphrasesAtt) {
|
---|
831 | FastVector vals = new FastVector(2);
|
---|
832 | vals.addElement("False");
|
---|
833 | vals.addElement("True");
|
---|
834 | atts.addElement(new Attribute("Keyphrase?", vals));
|
---|
835 | } else {
|
---|
836 | atts.addElement(getInputFormat().attribute(i));
|
---|
837 | }
|
---|
838 | }
|
---|
839 | Instances outFormat = new Instances("KEAdata", atts, 0);
|
---|
840 | setOutputFormat(outFormat);
|
---|
841 |
|
---|
842 | // Convert pending input instances into output data
|
---|
843 | for(int i = 0; i < getInputFormat().numInstances(); i++) {
|
---|
844 | Instance current = getInputFormat().instance(i);
|
---|
845 | FastVector vector = convertInstance(current, true);
|
---|
846 | Enumeration enum = vector.elements();
|
---|
847 | while (enum.hasMoreElements()) {
|
---|
848 | Instance inst = (Instance)enum.nextElement();
|
---|
849 | push(inst);
|
---|
850 | }
|
---|
851 | }
|
---|
852 | }
|
---|
853 |
|
---|
/**
 * Converts a single document instance into a set of output
 * instances, one per candidate phrase found in the document.
 *
 * Each output instance carries the phrase (stemmed and original),
 * its TFxIDF and first-occurrence features, the classifier's
 * probability of it being a keyphrase, and a rank computed over all
 * phrases of the document. If author-assigned keyphrases are known,
 * dummy instances are also emitted for keyphrases that do not occur
 * in the document, so that evaluation can account for them.
 *
 * @param instance the document instance (document text and,
 *        possibly, the author-assigned keyphrases)
 * @param training true during training (passed through to featVals)
 * @return a FastVector of output instances, sorted by descending
 *         probability (ties broken by TFxIDF, then first occurrence)
 * @throws Exception if the classifier cannot classify a phrase
 */
private FastVector convertInstance(Instance instance, boolean training)
  throws Exception {

  FastVector vector = new FastVector();

  if (m_Debug) {
    System.err.println("-- Converting instance");
  }

  // Get the key phrases for the document: one map in internal
  // (match) format, one in evaluation (stemmed) format
  HashMap hashKeyphrases = null;
  HashMap hashKeysEval = null;
  if (!instance.isMissing(m_KeyphrasesAtt)) {
    String keyphrases = instance.stringValue(m_KeyphrasesAtt);
    hashKeyphrases = getGivenKeyphrases(keyphrases, false);
    hashKeysEval = getGivenKeyphrases(keyphrases, true);
  }

  // Get the candidate phrases for the document; length is the value
  // returned by getPhrases (document word count, used to normalize
  // the first-occurrence feature)
  HashMap hash = new HashMap();
  int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));

  // Compute number of extra attributes added to the output format
  // (N-gram, N-gram-original, TFxIDF, First_occurrence, Probability,
  // Rank replace the single document attribute; keyphrase frequency
  // is added only in debug mode when m_KFused is set)
  int numFeatures = 5;
  if (m_Debug) {
    if (m_KFused) {
      numFeatures = numFeatures + 1;
    }
  }

  // Set indices of key attributes in the output format
  int phraseAttIndex = m_DocumentAtt;
  int tfidfAttIndex = m_DocumentAtt + 2;
  int distAttIndex = m_DocumentAtt + 3;
  // Provisional; recomputed below while the first instance is built
  int probsAttIndex = m_DocumentAtt + numFeatures - 1;

  // Go through the phrases and convert them into instances
  Iterator it = hash.keySet().iterator();
  while (it.hasNext()) {
    String phrase = (String)it.next();
    FastVector phraseInfo = (FastVector)hash.get(phrase);
    double[] vals = featVals(phrase, phraseInfo, training,
                             hashKeysEval, hashKeyphrases, length);
    Instance inst = new Instance(instance.weight(), vals);
    inst.setDataset(m_ClassifierData);

    // Get probability of phrase being key phrase (class index 1)
    double[] probs = m_Classifier.distributionForInstance(inst);
    double prob = probs[1];

    // Compute attribute values for final instance
    double[] newInst =
      new double[instance.numAttributes() + numFeatures];
    int pos = 0;
    for (int i = 0; i < instance.numAttributes(); i++) {
      if (i == m_DocumentAtt) {

        // Add phrase (string values must be registered with the
        // output attribute; the returned index is stored)
        int index = outputFormatPeek().attribute(pos).
          addStringValue(phrase);
        newInst[pos++] = index;

        // Add original (unstemmed, canonical) version
        index = outputFormatPeek().attribute(pos).
          addStringValue((String)phraseInfo.elementAt(2));
        newInst[pos++] = index;

        // Add TFxIDF
        newInst[pos++] = inst.value(m_TfidfIndex);

        // Add distance (first occurrence)
        newInst[pos++] = inst.value(m_FirstOccurIndex);

        // Add other features
        if (m_Debug) {
          if (m_KFused) {
            newInst[pos++] = inst.value(m_KeyFreqIndex);
          }
        }

        // Add probability; remember its actual position for the
        // sorting and ranking passes below
        probsAttIndex = pos;
        newInst[pos++] = prob;

        // Set rank to missing (computed below)
        newInst[pos++] = Instance.missingValue();
      } else if (i == m_KeyphrasesAtt) {
        newInst[pos++] = inst.classValue();
      } else {
        newInst[pos++] = instance.value(i);
      }
    }
    Instance ins = new Instance(instance.weight(), newInst);
    ins.setDataset(outputFormatPeek());
    vector.addElement(ins);
  }

  // Add dummy instances for keyphrases that don't occur
  // in the document
  if (hashKeysEval != null) {
    Iterator phrases = hashKeysEval.keySet().iterator();
    while (phrases.hasNext()) {
      String phrase = (String)phrases.next();
      double[] newInst =
        new double[instance.numAttributes() + numFeatures];
      int pos = 0;
      for (int i = 0; i < instance.numAttributes(); i++) {
        if (i == m_DocumentAtt) {

          // Add phrase (stemmed evaluation form)
          int index = outputFormatPeek().attribute(pos).
            addStringValue(phrase);
          newInst[pos++] = (double)index;

          // Add original version (as supplied by the author)
          index = outputFormatPeek().attribute(pos).
            addStringValue((String)hashKeysEval.get(phrase));
          newInst[pos++] = (double)index;

          // Add TFxIDF (unknown: phrase not in document)
          newInst[pos++] = Instance.missingValue();

          // Add distance (unknown: phrase not in document)
          newInst[pos++] = Instance.missingValue();

          // Add other features
          if (m_Debug) {
            if (m_KFused) {
              newInst[pos++] = Instance.missingValue();
            }
          }

          // Add probability and rank; -MAX_VALUE sorts these dummy
          // instances behind every real candidate phrase
          newInst[pos++] = -Double.MAX_VALUE;
          newInst[pos++] = Instance.missingValue();
        } else if (i == m_KeyphrasesAtt) {
          newInst[pos++] = 1; // Keyphrase
        } else {
          newInst[pos++] = instance.value(i);
        }
      }
      Instance inst = new Instance(instance.weight(), newInst);
      inst.setDataset(outputFormatPeek());
      vector.addElement(inst);
    }
  }

  // Three chained stable sorts: the last key (probability) dominates,
  // with TFxIDF and then first occurrence breaking ties.

  // Sort phrases according to their distance (stable sort)
  double[] vals = new double[vector.size()];
  for (int i = 0; i < vals.length; i++) {
    vals[i] = ((Instance)vector.elementAt(i)).value(distAttIndex);
  }
  FastVector newVector = new FastVector(vector.size());
  int[] sortedIndices = Utils.stableSort(vals);
  for (int i = 0; i < vals.length; i++) {
    newVector.addElement(vector.elementAt(sortedIndices[i]));
  }
  vector = newVector;

  // Sort phrases according to their tfxidf value (stable sort);
  // negated so larger TFxIDF sorts first
  for (int i = 0; i < vals.length; i++) {
    vals[i] = -((Instance)vector.elementAt(i)).value(tfidfAttIndex);
  }
  newVector = new FastVector(vector.size());
  sortedIndices = Utils.stableSort(vals);
  for (int i = 0; i < vals.length; i++) {
    newVector.addElement(vector.elementAt(sortedIndices[i]));
  }
  vector = newVector;

  // Sort phrases according to their probability (stable sort);
  // 1 - p so higher probability sorts first
  for (int i = 0; i < vals.length; i++) {
    vals[i] = 1 - ((Instance)vector.elementAt(i)).value(probsAttIndex);
  }
  newVector = new FastVector(vector.size());
  sortedIndices = Utils.stableSort(vals);
  for (int i = 0; i < vals.length; i++) {
    newVector.addElement(vector.elementAt(sortedIndices[i]));
  }
  vector = newVector;

  // Compute rank of phrases. Check for subphrases that are ranked
  // lower than superphrases and assign probability -1 and set the
  // rank to Integer.MAX_VALUE
  int rank = 1;
  for (int i = 0; i < vals.length; i++) {
    Instance currentInstance = (Instance)vector.elementAt(i);

    // Short cut: if phrase very unlikely make rank very low and continue
    // (vals[i] >= 1 means probability <= 0, i.e. the dummy instances)
    if (Utils.grOrEq(vals[i], 1.0)) {
      currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
      continue;
    }

    // Otherwise look for super phrase starting with first phrase
    // in list that has same probability, TFxIDF value, and distance as
    // current phrase. We do this to catch all superphrases
    // that have same probability, TFxIDF value and distance as current phrase.
    int startInd = i;
    while (startInd < vals.length) {
      Instance inst = (Instance)vector.elementAt(startInd);
      if ((inst.value(tfidfAttIndex) !=
           currentInstance.value(tfidfAttIndex)) ||
          (inst.value(probsAttIndex) !=
           currentInstance.value(probsAttIndex)) ||
          (inst.value(distAttIndex) !=
           currentInstance.value(distAttIndex))) {
        break;
      }
      startInd++;
    }
    String val = currentInstance.stringValue(phraseAttIndex);
    boolean foundSuperphrase = false;
    for (int j = startInd - 1; j >= 0; j--) {
      if (j != i) {
        Instance candidate = (Instance)vector.elementAt(j);
        String potSuperphrase = candidate.stringValue(phraseAttIndex);
        if (val.length() <= potSuperphrase.length()) {
          if (KEAFilter.contains(val, potSuperphrase)) {
            foundSuperphrase = true;
            break;
          }
        }
      }
    }
    if (foundSuperphrase) {
      currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
    } else {
      currentInstance.setValue(probsAttIndex + 1, rank++);
    }
  }
  return vector;
}
|
---|
1090 |
|
---|
1091 | /**
|
---|
1092 | * Checks whether one phrase is a subphrase of another phrase.
|
---|
1093 | */
|
---|
1094 | private static boolean contains(String sub, String sup) {
|
---|
1095 |
|
---|
1096 | int i = 0;
|
---|
1097 | while (i + sub.length() - 1 < sup.length()) {
|
---|
1098 | int j;
|
---|
1099 | for (j = 0; j < sub.length(); j++) {
|
---|
1100 | if (sub.charAt(j) != sup.charAt(i + j)) {
|
---|
1101 | break;
|
---|
1102 | }
|
---|
1103 | }
|
---|
1104 | if (j == sub.length()) {
|
---|
1105 | if ((i + j) < sup.length()) {
|
---|
1106 | if (sup.charAt(i + j) == ' ') {
|
---|
1107 | return true;
|
---|
1108 | } else {
|
---|
1109 | return false;
|
---|
1110 | }
|
---|
1111 | } else {
|
---|
1112 | return true;
|
---|
1113 | }
|
---|
1114 | }
|
---|
1115 |
|
---|
1116 | // Skip forward to next space
|
---|
1117 | do {
|
---|
1118 | i++;
|
---|
1119 | } while ((i < sup.length()) && (sup.charAt(i) != ' '));
|
---|
1120 | i++;
|
---|
1121 | }
|
---|
1122 | return false;
|
---|
1123 | }
|
---|
1124 |
|
---|
/**
 * Returns a hashtable. Fills the hashtable
 * with the stemmed n-grams occuring in the given string
 * (as keys) and the number of times it occurs.
 *
 * The string is expected to contain one pre-tokenized phrase per
 * line, with words separated by single spaces. N-grams never cross
 * line boundaries, never start or end with a stopword, and are
 * between m_MinPhraseLength and m_MaxPhraseLength words long.
 *
 * @param str the document text, one phrase per line
 * @return map from internally-formatted (stemmed) n-gram to a
 *         Counter of its occurrences
 */
private HashMap getPhrasesForDictionary(String str) {

  // Sliding window over the last m_MaxPhraseLength words seen
  String[] buffer = new String[m_MaxPhraseLength];
  HashMap hash = new HashMap();

  StringTokenizer tok = new StringTokenizer(str, "\n");
  while (tok.hasMoreTokens()) {
    String phrase = tok.nextToken();
    // numSeen = number of valid (current-line) words in the buffer;
    // resetting it per line keeps stale words from leaking across lines
    int numSeen = 0;
    StringTokenizer wordTok = new StringTokenizer(phrase, " ");
    while (wordTok.hasMoreTokens()) {
      String word = wordTok.nextToken();

      // Store word in buffer (shift left, append at the right end)
      for (int i = 0; i < m_MaxPhraseLength - 1; i++) {
        buffer[i] = buffer[i + 1];
      }
      buffer[m_MaxPhraseLength - 1] = word;

      // How many are buffered?
      numSeen++;
      if (numSeen > m_MaxPhraseLength) {
        numSeen = m_MaxPhraseLength;
      }

      // Don't consider phrases that end with a stop word
      if (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - 1])) {
        continue;
      }

      // Loop through buffer and add phrases to hashtable;
      // phraseBuffer grows by prepending one earlier word per pass
      StringBuffer phraseBuffer = new StringBuffer();
      for (int i = 1; i <= numSeen; i++) {
        if (i > 1) {
          phraseBuffer.insert(0, ' ');
        }
        phraseBuffer.insert(0, buffer[m_MaxPhraseLength - i]);

        // Don't consider phrases that begin with a stop word
        if ((i > 1) &&
            (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - i]))) {
          continue;
        }

        // Only consider phrases with minimum length
        if (i >= m_MinPhraseLength) {

          // Stem string; the stemmed form is the hash key.
          // NOTE(review): a first occurrence stores a fresh Counter and
          // later ones call increment() — this only counts correctly if
          // Counter starts at 1; confirm against the Counter class.
          String orig = phraseBuffer.toString();
          String internal = internalFormat(orig);
          Counter count = (Counter)hash.get(internal);
          if (count == null) {
            hash.put(internal, new Counter());
          } else {
            count.increment();
          }

          // Add components if phrase is single word before
          // conversion into internal format (i.e. error-correcting)
          /*if ((orig.indexOf(' ') == -1) &&
            (internal.indexOf(' ') != -1)) {
            StringTokenizer tokW = new StringTokenizer(internal, " ");
            while (tokW.hasMoreTokens()) {
              String comp = (String)tokW.nextToken();
              Counter countW = (Counter)hash.get(comp);
              if (countW == null) {
                hash.put(comp, new Counter());
              } else {
                countW.increment();
              }
            }
          }*/
        }
      }
    }
  }
  return hash;
}
|
---|
1208 |
|
---|
/**
 * Expects an empty hashtable. Fills the hashtable
 * with the stemmed n-grams occuring in the given string
 * (as keys). Stores the position, the number of occurences,
 * and the most commonly occurring orgininal version of
 * each n-gram.
 *
 * N-grams that occur less than m_MinNumOccur are not used.
 *
 * Returns the total number of words (!) in the string.
 * NOTE(review): pos is initialized to 1 and incremented once per
 * word, so the value returned is actually (number of words + 1);
 * callers appear to use it only as a normalization constant.
 *
 * @param hash empty map; filled with stemmed n-gram -> FastVector of
 *        [Counter(first word position), Counter(occurrences),
 *         canonical original String]
 * @param str the document text, one phrase per line, words separated
 *        by single spaces
 * @return the word count of the string (off by one, see note above)
 */
private int getPhrases(HashMap hash, String str) {

  // Sliding window over the last m_MaxPhraseLength words seen
  String[] buffer = new String[m_MaxPhraseLength];

  StringTokenizer tok = new StringTokenizer(str, "\n");
  int pos = 1;
  while (tok.hasMoreTokens()) {
    String phrase = tok.nextToken();
    // Words buffered from the current line only
    int numSeen = 0;
    StringTokenizer wordTok = new StringTokenizer(phrase, " ");
    while (wordTok.hasMoreTokens()) {
      String word = wordTok.nextToken();

      // Store word in buffer (shift left, append at the right end)
      for (int i = 0; i < m_MaxPhraseLength - 1; i++) {
        buffer[i] = buffer[i + 1];
      }
      buffer[m_MaxPhraseLength - 1] = word;

      // How many are buffered?
      numSeen++;
      if (numSeen > m_MaxPhraseLength) {
        numSeen = m_MaxPhraseLength;
      }

      // Don't consider phrases that end with a stop word
      if (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - 1])) {
        pos++;
        continue;
      }

      // Loop through buffer and add phrases to hashtable;
      // phraseBuffer grows by prepending one earlier word per pass
      StringBuffer phraseBuffer = new StringBuffer();
      for (int i = 1; i <= numSeen; i++) {
        if (i > 1) {
          phraseBuffer.insert(0, ' ');
        }
        phraseBuffer.insert(0, buffer[m_MaxPhraseLength - i]);

        // Don't consider phrases that begin with a stop word
        if ((i > 1) &&
            (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - i]))) {
          continue;
        }

        // Only consider phrases with minimum length
        if (i >= m_MinPhraseLength) {

          // Stem string; the stemmed form is the hash key
          String phrStr = phraseBuffer.toString();
          String internal = internalFormat(phrStr);
          FastVector vec = (FastVector)hash.get(internal);
          if (vec == null) {
            vec = new FastVector(3);

            // HashMap for storing all versions (surface forms)
            HashMap secHash = new HashMap();
            secHash.put(phrStr, new Counter());

            // Update hashtable with all the info:
            // element 0 = position of the phrase's first word
            //             (pos + 1 - i, since i words are buffered),
            // element 1 = occurrence counter,
            // element 2 = map of surface forms (canonicalized below)
            vec.addElement(new Counter(pos + 1 - i));
            vec.addElement(new Counter());
            vec.addElement(secHash);
            hash.put(internal, vec);
          } else {

            // Update number of occurrences
            ((Counter)((FastVector)vec).elementAt(1)).increment();

            // Update hashtable storing different versions
            HashMap secHash = (HashMap)vec.elementAt(2);
            Counter count = (Counter)secHash.get(phrStr);
            if (count == null) {
              secHash.put(phrStr, new Counter());
            } else {
              count.increment();
            }
          }
        }
      }
      pos++;
    }
  }

  // Replace secondary hashtables with most commonly occurring
  // version of each phrase (canonical) form. Delete all words
  // that are proper nouns.
  Iterator phrases = hash.keySet().iterator();
  while (phrases.hasNext()) {
    String phrase = (String)phrases.next();
    FastVector info = (FastVector)hash.get(phrase);

    // Occurring less than m_MinNumOccur?
    if (((Counter)((FastVector)info).elementAt(1)).value() < m_MinNumOccur) {
      phrases.remove();
      continue;
    }

    // Get canonical form; null means the phrase looked like a proper
    // noun and is dropped entirely
    String canForm = canonicalForm((HashMap)info.elementAt(2));
    if (canForm == null) {
      phrases.remove();
    } else {
      info.setElementAt(canForm, 2);
    }
  }
  return pos;
}
|
---|
1328 |
|
---|
1329 | /**
|
---|
1330 | * Create canonical form of phrase.
|
---|
1331 | */
|
---|
1332 | private String canonicalForm(HashMap secHash) {
|
---|
1333 |
|
---|
1334 | int max = 0; String bestVersion = null;
|
---|
1335 | boolean allFullyHyphenated = true;
|
---|
1336 | int num = 0;
|
---|
1337 | Iterator versions = secHash.keySet().iterator();
|
---|
1338 | while (versions.hasNext()) {
|
---|
1339 | num++;
|
---|
1340 | String version = (String)versions.next();
|
---|
1341 |
|
---|
1342 | // Are all the words joined up?
|
---|
1343 | if (!isFullyHyphenated(version)) {
|
---|
1344 | allFullyHyphenated = false;
|
---|
1345 | }
|
---|
1346 |
|
---|
1347 | // Check for how often this version occurs
|
---|
1348 | Counter count = (Counter)secHash.get(version);
|
---|
1349 | if (count.value() > max) {
|
---|
1350 | max = count.value();
|
---|
1351 | bestVersion = version;
|
---|
1352 | }
|
---|
1353 | }
|
---|
1354 | if ((getCheckForProperNouns()) && (num == 1) && properNoun(bestVersion)) {
|
---|
1355 | //if (allProperNouns) {
|
---|
1356 | return null;
|
---|
1357 | } else {
|
---|
1358 | if (isFullyHyphenated(bestVersion) &&
|
---|
1359 | !allFullyHyphenated) {
|
---|
1360 | bestVersion = bestVersion.replace('-', ' ');
|
---|
1361 | }
|
---|
1362 | return bestVersion;
|
---|
1363 | }
|
---|
1364 | }
|
---|
1365 |
|
---|
1366 | /**
|
---|
1367 | * Checks whether the given phrase is
|
---|
1368 | * fully hyphenated.
|
---|
1369 | */
|
---|
1370 | private boolean isFullyHyphenated(String str) {
|
---|
1371 |
|
---|
1372 | return (str.indexOf(' ') == -1);
|
---|
1373 | }
|
---|
1374 |
|
---|
1375 | /**
|
---|
1376 | * Checks whether the given string is a
|
---|
1377 | * proper noun.
|
---|
1378 | *
|
---|
1379 | * @return true if it is a potential proper noun
|
---|
1380 | */
|
---|
1381 | private static boolean properNoun(String str) {
|
---|
1382 |
|
---|
1383 | // Is it more than one word?
|
---|
1384 | if (str.indexOf(' ') != -1) {
|
---|
1385 | return false;
|
---|
1386 | }
|
---|
1387 |
|
---|
1388 | // Does it start with an upper-case character?
|
---|
1389 | if (Character.isLowerCase(str.charAt(0))) {
|
---|
1390 | return false;
|
---|
1391 | }
|
---|
1392 |
|
---|
1393 | // Is there at least one character that's
|
---|
1394 | // not upper-case?
|
---|
1395 | for (int i = 1; i < str.length(); i++) {
|
---|
1396 | /*if (Character.isUpperCase(str.charAt(i))) {
|
---|
1397 | return false;
|
---|
1398 | }*/
|
---|
1399 | if (!Character.isUpperCase(str.charAt(i))) {
|
---|
1400 | return true;
|
---|
1401 | }
|
---|
1402 | }
|
---|
1403 | //return true;
|
---|
1404 | return false;
|
---|
1405 | }
|
---|
1406 |
|
---|
/**
 * Generates the evaluation format of a phrase: the stemmed version
 * used to match candidate phrases against author-assigned
 * keyphrases (no character substitution, unlike internalFormat).
 *
 * @param str the phrase to normalize
 * @return the stemmed phrase
 */
private String evalFormat(String str) {

  return m_Stemmer.stemString(str);
}
|
---|
1414 |
|
---|
1415 | /**
|
---|
1416 | * Generates the internal format of a phrase.
|
---|
1417 | */
|
---|
1418 | private String internalFormat(String str) {
|
---|
1419 |
|
---|
1420 | // Remove some non-alphanumeric characters
|
---|
1421 | str = str.replace('-', ' ');
|
---|
1422 | str = str.replace('/', ' ');
|
---|
1423 | str = str.replace('&', ' ');
|
---|
1424 |
|
---|
1425 | // Stem string
|
---|
1426 | return m_Stemmer.stemString(str);
|
---|
1427 | }
|
---|
1428 |
|
---|
1429 | /**
|
---|
1430 | * Gets all the phrases in the given string and puts them into the
|
---|
1431 | * hashtable. Also stores the original version of the stemmed
|
---|
1432 | * phrase in the hash table.
|
---|
1433 | */
|
---|
1434 | private HashMap getGivenKeyphrases(String str,
|
---|
1435 | boolean forEval) {
|
---|
1436 |
|
---|
1437 | FastVector vector = new FastVector();
|
---|
1438 | HashMap hash = new HashMap();
|
---|
1439 |
|
---|
1440 | StringTokenizer tok = new StringTokenizer(str, "\n");
|
---|
1441 | while (tok.hasMoreTokens()) {
|
---|
1442 | String orig = tok.nextToken();
|
---|
1443 | orig = orig.trim();
|
---|
1444 | if (orig.length() > 0) {
|
---|
1445 | String modified;
|
---|
1446 | if (!forEval) {
|
---|
1447 | modified = internalFormat(orig);
|
---|
1448 | } else {
|
---|
1449 | modified = evalFormat(orig);
|
---|
1450 | }
|
---|
1451 | if (!hash.containsKey(modified)) {
|
---|
1452 | hash.put(modified, orig);
|
---|
1453 | } else {
|
---|
1454 | if (forEval) {
|
---|
1455 | System.err.println("WARNING: stem of author-assigned keyphrase " +
|
---|
1456 | orig + " matches existing stem (skipping it)!");
|
---|
1457 | }
|
---|
1458 | }
|
---|
1459 | }
|
---|
1460 | }
|
---|
1461 | if (hash.size() == 0) {
|
---|
1462 | return null;
|
---|
1463 | } else {
|
---|
1464 | return hash;
|
---|
1465 | }
|
---|
1466 | }
|
---|
1467 |
|
---|
1468 | /**
|
---|
1469 | * Main method for testing this class.
|
---|
1470 | *
|
---|
1471 | * @param argv should contain arguments to the filter: use -h for help
|
---|
1472 | */
|
---|
1473 | public static void main(String [] argv) {
|
---|
1474 |
|
---|
1475 | try {
|
---|
1476 | if (Utils.getFlag('b', argv)) {
|
---|
1477 | Filter.batchFilterFile(new KEAFilter(), argv);
|
---|
1478 | } else {
|
---|
1479 | Filter.filterFile(new KEAFilter(), argv);
|
---|
1480 | }
|
---|
1481 | } catch (Exception ex) {
|
---|
1482 | System.out.println(ex.getMessage());
|
---|
1483 | }
|
---|
1484 | }
|
---|
1485 | }
|
---|
1486 |
|
---|
1487 |
|
---|
1488 |
|
---|
1489 |
|
---|
1490 |
|
---|
1491 |
|
---|
1492 |
|
---|
1493 |
|
---|
1494 |
|
---|
1495 |
|
---|
1496 |
|
---|