source: trunk/gsdl/packages/kea/kea-3.0/KEAFilter.java@8815

Last change on this file since 8815 was 8815, checked in by mdewsnip, 19 years ago

Kea 3.0, as downloaded from http://www.nzdl.org/kea but with CSTR_abstracts_test, CSTR_abstracts_train, Chinese_test, and Chinese_train directories removed.

  • Property svn:keywords set to Author Date Id Revision
File size: 41.3 KB
/*
 * KEAFilter.java
 * Copyright (C) 2000, 2001 Eibe Frank
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

import java.util.*;
import java.io.*;

import weka.core.*;
import weka.filters.*;
import weka.classifiers.*;

/**
 * This filter converts the incoming data into data appropriate for
 * keyphrase classification. It assumes that the dataset contains two
 * string attributes. The first attribute should contain the text of a
 * document. The second attribute should contain the keyphrases
 * associated with that document (if present).
 *
 * The filter converts every instance (i.e. document) into a set of
 * instances, one for each word-based n-gram in the document. The
 * string attribute representing the document is replaced by some
 * numeric features, the estimated probability of each n-gram being a
 * keyphrase, and the rank of this phrase in the document according to
 * the probability. Each new instance also has a class value
 * associated with it. The class is "true" if the n-gram is a true
 * keyphrase, and "false" otherwise. Of course, if the input document
 * doesn't come with author-assigned keyphrases, the class values for
 * that document will be missing.
 *
 * @author Eibe Frank ([email protected])
 * @version 1.0
 */
public class KEAFilter extends Filter implements OptionHandler {

  /** Index of attribute containing the documents */
  private int m_DocumentAtt = 0;

  /** Index of attribute containing the keyphrases */
  private int m_KeyphrasesAtt = 1;

  /** The maximum length of phrases */
  private int m_MaxPhraseLength = 3;

  /** The minimum length of phrases */
  private int m_MinPhraseLength = 1;

  /** Is keyphrase frequency attribute being used? */
  private boolean m_KFused = false;

  /** Flag for debugging mode */
  private boolean m_Debug = false;

  /** Determines whether internal periods are allowed */
  private boolean m_DisallowInternalPeriods = false;

  /** The minimum number of occurrences of a phrase */
  private int m_MinNumOccur = 2;

  /** The number of features describing a phrase */
  private int m_NumFeatures = 2;

  /* Indices of attributes in m_ClassifierData */
  private int m_TfidfIndex = 0;
  private int m_FirstOccurIndex = 1;
  private int m_KeyFreqIndex = 2;

  /** The punctuation filter used by this filter */
  private KEAPhraseFilter m_PunctFilter = null;

  /** The numbers filter used by this filter */
  private NumbersFilter m_NumbersFilter = null;

  /** The actual classifier used to compute probabilities */
  private DistributionClassifier m_Classifier = null;

  /** The dictionary containing the document frequencies */
  private HashMap m_Dictionary = null;

  /** The dictionary containing the keyphrases */
  private HashMap m_KeyphraseDictionary = null;

  /** The number of documents in the global frequencies corpus */
  private int m_NumDocs = 0;

  /** Template for the classifier data */
  private Instances m_ClassifierData = null;

  /** The stemmer to be used */
  private Stemmer m_Stemmer = new IteratedLovinsStemmer();

  /** The list of stop words to be used */
  private Stopwords m_Stopwords = new StopwordsEnglish();

  /** Determines whether check for proper nouns is performed */
  private boolean m_CheckForProperNouns = true;

  /**
   * Get the CheckForProperNouns value.
   * @return the CheckForProperNouns value.
   */
  public boolean getCheckForProperNouns() {
    return m_CheckForProperNouns;
  }

  /**
   * Set the CheckForProperNouns value.
   * @param newCheckForProperNouns the new CheckForProperNouns value.
   */
  public void setCheckForProperNouns(boolean newCheckForProperNouns) {
    this.m_CheckForProperNouns = newCheckForProperNouns;
  }

  /**
   * Get the Stopwords value.
   * @return the Stopwords value.
   */
  public Stopwords getStopwords() {
    return m_Stopwords;
  }

  /**
   * Set the Stopwords value.
   * @param newStopwords the new Stopwords value.
   */
  public void setStopwords(Stopwords newStopwords) {
    this.m_Stopwords = newStopwords;
  }

  /**
   * Get the Stemmer value.
   * @return the Stemmer value.
   */
  public Stemmer getStemmer() {

    return m_Stemmer;
  }

  /**
   * Set the Stemmer value.
   * @param newStemmer The new Stemmer value.
   */
  public void setStemmer(Stemmer newStemmer) {

    this.m_Stemmer = newStemmer;
  }

  /**
   * Get the value of MinNumOccur.
   *
   * @return Value of MinNumOccur.
   */
  public int getMinNumOccur() {

    return m_MinNumOccur;
  }

  /**
   * Set the value of MinNumOccur.
   *
   * @param newMinNumOccur Value to assign to MinNumOccur.
   */
  public void setMinNumOccur(int newMinNumOccur) {

    m_MinNumOccur = newMinNumOccur;
  }

  /**
   * Get the value of MaxPhraseLength.
   *
   * @return Value of MaxPhraseLength.
   */
  public int getMaxPhraseLength() {

    return m_MaxPhraseLength;
  }

  /**
   * Set the value of MaxPhraseLength.
   *
   * @param newMaxPhraseLength Value to assign to MaxPhraseLength.
   */
  public void setMaxPhraseLength(int newMaxPhraseLength) {

    m_MaxPhraseLength = newMaxPhraseLength;
  }

  /**
   * Get the value of MinPhraseLength.
   *
   * @return Value of MinPhraseLength.
   */
  public int getMinPhraseLength() {

    return m_MinPhraseLength;
  }

  /**
   * Set the value of MinPhraseLength.
   *
   * @param newMinPhraseLength Value to assign to MinPhraseLength.
   */
  public void setMinPhraseLength(int newMinPhraseLength) {

    m_MinPhraseLength = newMinPhraseLength;
  }

  /**
   * Returns the index of the stemmed phrases in the output ARFF file.
   */
  public int getStemmedPhraseIndex() {

    return m_DocumentAtt;
  }

  /**
   * Returns the index of the unstemmed phrases in the output ARFF file.
   */
  public int getUnstemmedPhraseIndex() {

    return m_DocumentAtt + 1;
  }

  /**
   * Returns the index of the phrases' probabilities in the output ARFF file.
   */
  public int getProbabilityIndex() {

    int index = m_DocumentAtt + 4;

    if (m_Debug) {
      if (m_KFused) {
        index++;
      }
    }
    return index;
  }

  /**
   * Returns the index of the phrases' ranks in the output ARFF file.
   */
  public int getRankIndex() {

    return getProbabilityIndex() + 1;
  }

  /**
   * Get the value of DocumentAtt.
   *
   * @return Value of DocumentAtt.
   */
  public int getDocumentAtt() {

    return m_DocumentAtt;
  }

  /**
   * Set the value of DocumentAtt.
   *
   * @param newDocumentAtt Value to assign to DocumentAtt.
   */
  public void setDocumentAtt(int newDocumentAtt) {

    m_DocumentAtt = newDocumentAtt;
  }

  /**
   * Get the value of KeyphrasesAtt.
   *
   * @return Value of KeyphrasesAtt.
   */
  public int getKeyphrasesAtt() {

    return m_KeyphrasesAtt;
  }

  /**
   * Set the value of KeyphrasesAtt.
   *
   * @param newKeyphrasesAtt Value to assign to KeyphrasesAtt.
   */
  public void setKeyphrasesAtt(int newKeyphrasesAtt) {

    m_KeyphrasesAtt = newKeyphrasesAtt;
  }

  /**
   * Get the value of Debug.
   *
   * @return Value of Debug.
   */
  public boolean getDebug() {

    return m_Debug;
  }

  /**
   * Set the value of Debug.
   *
   * @param newDebug Value to assign to Debug.
   */
  public void setDebug(boolean newDebug) {

    m_Debug = newDebug;
  }

  /**
   * Sets whether keyphrase frequency attribute is used.
   */
  public void setKFused(boolean flag) {

    m_KFused = flag;
    if (flag) {
      m_NumFeatures = 3;
    } else {
      m_NumFeatures = 2;
    }
  }

  /**
   * Gets whether keyphrase frequency attribute is used.
   */
  public boolean getKFused() {

    return m_KFused;
  }

  /**
   * Gets whether phrases with internal periods are disallowed.
   *
   * @return true if phrases with internal periods are disallowed
   */
  public boolean getDisallowInternalPeriods() {

    return m_DisallowInternalPeriods;
  }

  /**
   * Sets whether phrases with internal periods are disallowed.
   *
   * @param disallow true if phrases with internal periods are to be disallowed
   */
  public void setDisallowInternalPeriods(boolean disallow) {

    m_DisallowInternalPeriods = disallow;
  }

  /**
   * Parses a given list of options controlling the behaviour of this object.
   * Valid options are:<p>
   *
   * -K<br>
   * Specifies whether keyphrase frequency statistic is used.<p>
   *
   * -M length<br>
   * Sets the maximum phrase length (default: 3).<p>
   *
   * -L length<br>
   * Sets the minimum phrase length (default: 1).<p>
   *
   * -D<br>
   * Turns debugging mode on.<p>
   *
   * -I index<br>
   * Sets the index of the attribute containing the documents (default: 0).<p>
   *
   * -J index<br>
   * Sets the index of the attribute containing the keyphrases (default: 1).<p>
   *
   * -P<br>
   * Disallow internal periods.<p>
   *
   * -O number<br>
   * The minimum number of times a phrase needs to occur (default: 2).<p>
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {

    setKFused(Utils.getFlag('K', options));
    setDebug(Utils.getFlag('D', options));
    String docAttIndexString = Utils.getOption('I', options);
    if (docAttIndexString.length() > 0) {
      setDocumentAtt(Integer.parseInt(docAttIndexString) - 1);
    } else {
      setDocumentAtt(0);
    }
    String keyphraseAttIndexString = Utils.getOption('J', options);
    if (keyphraseAttIndexString.length() > 0) {
      setKeyphrasesAtt(Integer.parseInt(keyphraseAttIndexString) - 1);
    } else {
      setKeyphrasesAtt(1);
    }
    String maxPhraseLengthString = Utils.getOption('M', options);
    if (maxPhraseLengthString.length() > 0) {
      setMaxPhraseLength(Integer.parseInt(maxPhraseLengthString));
    } else {
      setMaxPhraseLength(3);
    }
    String minPhraseLengthString = Utils.getOption('L', options);
    if (minPhraseLengthString.length() > 0) {
      setMinPhraseLength(Integer.parseInt(minPhraseLengthString));
    } else {
      setMinPhraseLength(1);
    }
    String minNumOccurString = Utils.getOption('O', options);
    if (minNumOccurString.length() > 0) {
      setMinNumOccur(Integer.parseInt(minNumOccurString));
    } else {
      setMinNumOccur(2);
    }
    setDisallowInternalPeriods(Utils.getFlag('P', options));
  }
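
  // Example (added; not part of the original source): the options above
  // could be supplied programmatically like this, assuming the Weka 3.x
  // OptionHandler API that this class implements:
  //
  //   KEAFilter filter = new KEAFilter();
  //   // document text in attribute 1, keyphrases in attribute 2 (1-based),
  //   // phrases of 1 to 3 words that occur at least twice
  //   filter.setOptions(new String[] {"-I", "1", "-J", "2", "-M", "3",
  //                                   "-L", "1", "-O", "2"});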

  /**
   * Gets the current settings of the filter.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String [] getOptions() {

    String [] options = new String [13];
    int current = 0;

    if (getKFused()) {
      options[current++] = "-K";
    }
    if (getDebug()) {
      options[current++] = "-D";
    }
    options[current++] = "-I";
    options[current++] = "" + (getDocumentAtt() + 1);
    options[current++] = "-J";
    options[current++] = "" + (getKeyphrasesAtt() + 1);
    options[current++] = "-M";
    options[current++] = "" + (getMaxPhraseLength());
    options[current++] = "-L";
    options[current++] = "" + (getMinPhraseLength());
    options[current++] = "-O";
    options[current++] = "" + (getMinNumOccur());

    if (getDisallowInternalPeriods()) {
      options[current++] = "-P";
    }

    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Returns an enumeration describing the available options
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {

    Vector newVector = new Vector(8);

    newVector.addElement(new Option(
        "\tSpecifies whether keyphrase frequency statistic is used.",
        "K", 0, "-K"));
    newVector.addElement(new Option(
        "\tSets the maximum phrase length (default: 3).",
        "M", 1, "-M <length>"));
    newVector.addElement(new Option(
        "\tSets the minimum phrase length (default: 1).",
        "L", 1, "-L <length>"));
    newVector.addElement(new Option(
        "\tTurns debugging mode on.",
        "D", 0, "-D"));
    newVector.addElement(new Option(
        "\tSets the index of the document attribute (default: 0).",
        "I", 1, "-I <index>"));
    newVector.addElement(new Option(
        "\tSets the index of the keyphrase attribute (default: 1).",
        "J", 1, "-J <index>"));
    newVector.addElement(new Option(
        "\tDisallow internal periods.",
        "P", 0, "-P"));
    newVector.addElement(new Option(
        "\tSet the minimum number of occurrences (default: 2).",
        "O", 1, "-O <number>"));

    return newVector.elements();
  }

  /**
   * Returns a string describing this filter
   *
   * @return a description of the filter suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "Converts incoming data into data appropriate for " +
      "keyphrase classification.";
  }

  /**
   * Sets the format of the input instances.
   *
   * @param instanceInfo an Instances object containing the input
   * instance structure (any instances contained in the object are
   * ignored - only the structure is required).
   * @return true if the outputFormat may be collected immediately
   */
  public boolean setInputFormat(Instances instanceInfo) throws Exception {

    if (instanceInfo.classIndex() >= 0) {
      throw new Exception("Don't know what to do if class index set!");
    }
    if (!instanceInfo.attribute(m_KeyphrasesAtt).isString() ||
        !instanceInfo.attribute(m_DocumentAtt).isString()) {
      throw new Exception("Keyphrase attribute and document attribute " +
                          "need to be string attributes.");
    }
    m_PunctFilter = new KEAPhraseFilter();
    int[] arr = new int[1];
    arr[0] = m_DocumentAtt;
    m_PunctFilter.setAttributeIndicesArray(arr);
    m_PunctFilter.setInputFormat(instanceInfo);
    m_PunctFilter.setDisallowInternalPeriods(getDisallowInternalPeriods());
    m_NumbersFilter = new NumbersFilter();
    m_NumbersFilter.setInputFormat(m_PunctFilter.getOutputFormat());
    super.setInputFormat(m_NumbersFilter.getOutputFormat());
    return false;
  }

  /**
   * Input an instance for filtering. Ordinarily the instance is processed
   * and made available for output immediately. Some filters require all
   * instances be read before producing output.
   *
   * @param instance the input instance
   * @return true if the filtered instance may now be
   * collected with output().
   * @exception Exception if the input instance was not of the correct
   * format or if there was a problem with the filtering.
   */
  public boolean input(Instance instance) throws Exception {

    if (getInputFormat() == null) {
      throw new Exception("No input instance format defined");
    }
    if (m_NewBatch) {
      resetQueue();
      m_NewBatch = false;
    }

    if (m_Debug) {
      System.err.println("-- Reading instance");
    }

    m_PunctFilter.input(instance);
    m_PunctFilter.batchFinished();
    instance = m_PunctFilter.output();

    m_NumbersFilter.input(instance);
    m_NumbersFilter.batchFinished();
    instance = m_NumbersFilter.output();

    if (m_Dictionary == null) {
      bufferInput(instance);
      return false;
    } else {
      FastVector vector = convertInstance(instance, false);
      Enumeration en = vector.elements();
      while (en.hasMoreElements()) {
        Instance inst = (Instance)en.nextElement();
        push(inst);
      }
      return true;
    }
  }

  /**
   * Signify that this batch of input to the filter is finished.
   * If the filter requires all instances prior to filtering,
   * output() may now be called to retrieve the filtered instances.
   *
   * @return true if there are instances pending output
   * @exception Exception if no input structure has been defined
   */
  public boolean batchFinished() throws Exception {

    if (getInputFormat() == null) {
      throw new Exception("No input instance format defined");
    }
    if (m_Dictionary == null) {
      buildGlobalDictionaries();
      buildClassifier();
      convertPendingInstances();
    }
    flushInput();
    m_NewBatch = true;
    return (numPendingOutput() != 0);
  }
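
  // Note on the filter's life cycle (added for clarity): during the first
  // batch, input() only buffers instances, because m_Dictionary is still
  // null. batchFinished() then builds the global dictionaries and the
  // classifier and converts the buffered instances. In later batches
  // m_Dictionary is non-null, so input() converts and pushes each
  // instance immediately.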

  /**
   * Builds the global dictionaries.
   */
  private void buildGlobalDictionaries() throws Exception {

    if (m_Debug) {
      System.err.println("--- Building global dictionaries");
    }

    // Build dictionary of n-grams with associated
    // document frequencies
    m_Dictionary = new HashMap();
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      String str = getInputFormat().instance(i).stringValue(m_DocumentAtt);
      HashMap hash = getPhrasesForDictionary(str);
      Iterator it = hash.keySet().iterator();
      while (it.hasNext()) {
        String phrase = (String)it.next();
        Counter counter = (Counter)m_Dictionary.get(phrase);
        if (counter == null) {
          m_Dictionary.put(phrase, new Counter());
        } else {
          counter.increment();
        }
      }
    }

    if (m_KFused) {

      // Build dictionary of n-grams that occur as keyphrases
      // with associated keyphrase frequencies
      m_KeyphraseDictionary = new HashMap();
      for (int i = 0; i < getInputFormat().numInstances(); i++) {
        String str = getInputFormat().instance(i).stringValue(m_KeyphrasesAtt);
        HashMap hash = getGivenKeyphrases(str, false);
        if (hash != null) {
          Iterator it = hash.keySet().iterator();
          while (it.hasNext()) {
            String phrase = (String)it.next();
            Counter counter = (Counter)m_KeyphraseDictionary.get(phrase);
            if (counter == null) {
              m_KeyphraseDictionary.put(phrase, new Counter());
            } else {
              counter.increment();
            }
          }
        }
      }
    } else {
      m_KeyphraseDictionary = null;
    }

    // Set the number of documents in the global corpus
    m_NumDocs = getInputFormat().numInstances();
  }

  /**
   * Builds the classifier.
   */
  private void buildClassifier() throws Exception {

    // Generate input format for classifier
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      if (i == m_DocumentAtt) {
        atts.addElement(new Attribute("TFxIDF"));
        atts.addElement(new Attribute("First_occurrence"));
        if (m_KFused) {
          atts.addElement(new Attribute("Keyphrase_frequency"));
        }
      } else if (i == m_KeyphrasesAtt) {
        FastVector vals = new FastVector(2);
        vals.addElement("False");
        vals.addElement("True");
        atts.addElement(new Attribute("Keyphrase?", vals));
      }
    }
    m_ClassifierData = new Instances("ClassifierData", atts, 0);
    m_ClassifierData.setClassIndex(m_NumFeatures);

    if (m_Debug) {
      System.err.println("--- Converting instances for classifier");
    }

    // Convert pending input instances into data for classifier
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance current = getInputFormat().instance(i);

      // Get the key phrases for the document
      String keyphrases = current.stringValue(m_KeyphrasesAtt);
      HashMap hashKeyphrases = getGivenKeyphrases(keyphrases, false);
      HashMap hashKeysEval = getGivenKeyphrases(keyphrases, true);

      // Get the phrases for the document
      HashMap hash = new HashMap();
      int length = getPhrases(hash, current.stringValue(m_DocumentAtt));

      // Compute the feature values for each phrase and
      // add the instance to the data for the classifier
      Iterator it = hash.keySet().iterator();
      while (it.hasNext()) {
        String phrase = (String)it.next();
        FastVector phraseInfo = (FastVector)hash.get(phrase);
        double[] vals = featVals(phrase, phraseInfo, true,
                                 hashKeysEval, hashKeyphrases, length);
        Instance inst = new Instance(current.weight(), vals);
        m_ClassifierData.add(inst);
      }
    }

    if (m_Debug) {
      System.err.println("--- Building classifier");
    }

    // Build classifier
    FilteredClassifier fclass = new FilteredClassifier();
    fclass.setClassifier(new weka.classifiers.NaiveBayesSimple());
    fclass.setFilter(new DiscretizeFilter());
    m_Classifier = fclass;
    m_Classifier.buildClassifier(m_ClassifierData);

    if (m_Debug) {
      System.err.println(m_Classifier);
    }

    // Save space
    m_ClassifierData = new Instances(m_ClassifierData, 0);
  }

  /**
   * Computes the feature values for a given phrase.
   */
  private double[] featVals(String phrase, FastVector phraseInfo,
                            boolean training, HashMap hashKeysEval,
                            HashMap hashKeyphrases, int length) {

    // Compute feature values
    Counter counterLocal = (Counter)phraseInfo.elementAt(1);
    double[] newInst = new double[m_NumFeatures + 1];

    // Compute TFxIDF
    Counter counterGlobal = (Counter)m_Dictionary.get(phrase);
    double localVal = counterLocal.value(), globalVal = 0;
    if (counterGlobal != null) {
      globalVal = counterGlobal.value();
      if (training) {
        globalVal = globalVal - 1;
      }
    }

    // Just divide by length to get approximation of probability
    // that phrase in document is our phrase
    newInst[m_TfidfIndex] = (localVal / ((double)length)) *
      (-Math.log((globalVal + 1) / ((double)m_NumDocs + 1)));

    // Compute first occurrence
    Counter counterFirst = (Counter)phraseInfo.elementAt(0);
    newInst[m_FirstOccurIndex] = (double)counterFirst.value() /
      (double)length;

    // Is keyphrase frequency attribute being used?
    if (m_KFused) {
      Counter keyphraseC = (Counter)m_KeyphraseDictionary.get(phrase);
      if ((training) && (hashKeyphrases != null) &&
          (hashKeyphrases.containsKey(phrase))) {
        newInst[m_KeyFreqIndex] = keyphraseC.value() - 1;
      } else {
        if (keyphraseC != null) {
          newInst[m_KeyFreqIndex] = keyphraseC.value();
        } else {
          newInst[m_KeyFreqIndex] = 0;
        }
      }
    }

    // Compute class value
    String phraseInEvalFormat = evalFormat((String)phraseInfo.elementAt(2));
    if (hashKeysEval == null) { // no author-assigned keyphrases
      newInst[m_NumFeatures] = Instance.missingValue();
    } else if (!hashKeysEval.containsKey(phraseInEvalFormat)) {
      newInst[m_NumFeatures] = 0; // Not a keyphrase
    } else {
      hashKeysEval.remove(phraseInEvalFormat);
      newInst[m_NumFeatures] = 1; // Keyphrase
    }
    return newInst;
  }
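
  /*
   * Worked example (added for clarity, with made-up numbers): suppose a
   * phrase occurs 3 times in a 600-word document (localVal = 3,
   * length = 600), appears in 4 of 100 training documents
   * (globalVal = 4, m_NumDocs = 100), and we are not training. Then
   *
   *   TFxIDF = (3 / 600) * -log((4 + 1) / (100 + 1))
   *          = 0.005 * -log(0.0495) ~= 0.015
   *
   * using the natural logarithm, as Math.log does above.
   */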

  /**
   * Sets output format and converts pending input instances.
   */
  private void convertPendingInstances() throws Exception {

    if (m_Debug) {
      System.err.println("--- Converting pending instances");
    }

    // Create output format for filter
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      if (i == m_DocumentAtt) {
        atts.addElement(new Attribute("N-gram", null));
        atts.addElement(new Attribute("N-gram-original", null));
        atts.addElement(new Attribute("TFxIDF"));
        atts.addElement(new Attribute("First_occurrence"));
        if (m_Debug) {
          if (m_KFused) {
            atts.addElement(new Attribute("Keyphrase_frequency"));
          }
        }
        atts.addElement(new Attribute("Probability"));
        atts.addElement(new Attribute("Rank"));
      } else if (i == m_KeyphrasesAtt) {
        FastVector vals = new FastVector(2);
        vals.addElement("False");
        vals.addElement("True");
        atts.addElement(new Attribute("Keyphrase?", vals));
      } else {
        atts.addElement(getInputFormat().attribute(i));
      }
    }
    Instances outFormat = new Instances("KEAdata", atts, 0);
    setOutputFormat(outFormat);

    // Convert pending input instances into output data
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance current = getInputFormat().instance(i);
      FastVector vector = convertInstance(current, true);
      Enumeration en = vector.elements();
      while (en.hasMoreElements()) {
        Instance inst = (Instance)en.nextElement();
        push(inst);
      }
    }
  }

  /**
   * Converts an instance.
   */
  private FastVector convertInstance(Instance instance, boolean training)
    throws Exception {

    FastVector vector = new FastVector();

    if (m_Debug) {
      System.err.println("-- Converting instance");
    }

    // Get the key phrases for the document
    HashMap hashKeyphrases = null;
    HashMap hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
      String keyphrases = instance.stringValue(m_KeyphrasesAtt);
      hashKeyphrases = getGivenKeyphrases(keyphrases, false);
      hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap hash = new HashMap();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));

    // Compute number of extra attributes
    int numFeatures = 5;
    if (m_Debug) {
      if (m_KFused) {
        numFeatures = numFeatures + 1;
      }
    }

    // Set indices of key attributes
    int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;

    // Go through the phrases and convert them into instances
    Iterator it = hash.keySet().iterator();
    while (it.hasNext()) {
      String phrase = (String)it.next();
      FastVector phraseInfo = (FastVector)hash.get(phrase);
      double[] vals = featVals(phrase, phraseInfo, training,
                               hashKeysEval, hashKeyphrases, length);
      Instance inst = new Instance(instance.weight(), vals);
      inst.setDataset(m_ClassifierData);

      // Get probability of phrase being key phrase
      double[] probs = m_Classifier.distributionForInstance(inst);
      double prob = probs[1];

      // Compute attribute values for final instance
      double[] newInst = new double[instance.numAttributes() + numFeatures];
      int pos = 0;
      for (int i = 0; i < instance.numAttributes(); i++) {
        if (i == m_DocumentAtt) {

          // Add phrase
          int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
          newInst[pos++] = index;

          // Add original version
          index = outputFormatPeek().attribute(pos).
            addStringValue((String)phraseInfo.elementAt(2));
          newInst[pos++] = index;

          // Add TFxIDF
          newInst[pos++] = inst.value(m_TfidfIndex);

          // Add distance
          newInst[pos++] = inst.value(m_FirstOccurIndex);

          // Add other features
          if (m_Debug) {
            if (m_KFused) {
              newInst[pos++] = inst.value(m_KeyFreqIndex);
            }
          }

          // Add probability
          probsAttIndex = pos;
          newInst[pos++] = prob;

          // Set rank to missing (computed below)
          newInst[pos++] = Instance.missingValue();
        } else if (i == m_KeyphrasesAtt) {
          newInst[pos++] = inst.classValue();
        } else {
          newInst[pos++] = instance.value(i);
        }
      }
      Instance ins = new Instance(instance.weight(), newInst);
      ins.setDataset(outputFormatPeek());
      vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
      Iterator phrases = hashKeysEval.keySet().iterator();
      while (phrases.hasNext()) {
        String phrase = (String)phrases.next();
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
          if (i == m_DocumentAtt) {

            // Add phrase
            int index = outputFormatPeek().attribute(pos).
              addStringValue(phrase);
            newInst[pos++] = (double)index;

            // Add original version
            index = outputFormatPeek().attribute(pos).
              addStringValue((String)hashKeysEval.get(phrase));
            newInst[pos++] = (double)index;

            // Add TFxIDF
            newInst[pos++] = Instance.missingValue();

            // Add distance
            newInst[pos++] = Instance.missingValue();

            // Add other features
            if (m_Debug) {
              if (m_KFused) {
                newInst[pos++] = Instance.missingValue();
              }
            }

            // Add probability and rank
            newInst[pos++] = -Double.MAX_VALUE;
            newInst[pos++] = Instance.missingValue();
          } else if (i == m_KeyphrasesAtt) {
            newInst[pos++] = 1; // Keyphrase
          } else {
            newInst[pos++] = instance.value(i);
          }
        }
        Instance inst = new Instance(instance.weight(), newInst);
        inst.setDataset(outputFormatPeek());
        vector.addElement(inst);
      }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
      vals[i] = ((Instance)vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
      newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
      vals[i] = -((Instance)vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
      newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
      vals[i] = 1 - ((Instance)vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
      newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and set their rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
      Instance currentInstance = (Instance)vector.elementAt(i);

      // Short cut: if phrase is very unlikely, make rank very low and continue
      if (Utils.grOrEq(vals[i], 1.0)) {
        currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
        continue;
      }

      // Otherwise look for superphrases, starting with the first phrase
      // in the list that has the same probability, TFxIDF value, and
      // distance as the current phrase. We do this to catch all
      // superphrases that have the same probability, TFxIDF value and
      // distance as the current phrase.
      int startInd = i;
      while (startInd < vals.length) {
        Instance inst = (Instance)vector.elementAt(startInd);
        if ((inst.value(tfidfAttIndex) !=
             currentInstance.value(tfidfAttIndex)) ||
            (inst.value(probsAttIndex) !=
             currentInstance.value(probsAttIndex)) ||
            (inst.value(distAttIndex) !=
             currentInstance.value(distAttIndex))) {
          break;
        }
        startInd++;
      }
      String val = currentInstance.stringValue(phraseAttIndex);
      boolean foundSuperphrase = false;
      for (int j = startInd - 1; j >= 0; j--) {
        if (j != i) {
          Instance candidate = (Instance)vector.elementAt(j);
          String potSuperphrase = candidate.stringValue(phraseAttIndex);
          if (val.length() <= potSuperphrase.length()) {
            if (KEAFilter.contains(val, potSuperphrase)) {
              foundSuperphrase = true;
              break;
            }
          }
        }
      }
      if (foundSuperphrase) {
        currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
      } else {
        currentInstance.setValue(probsAttIndex + 1, rank++);
      }
    }
    return vector;
  }

  /**
   * Checks whether one phrase is a subphrase of another phrase.
   */
  private static boolean contains(String sub, String sup) {

    int i = 0;
    while (i + sub.length() - 1 < sup.length()) {
      int j;
      for (j = 0; j < sub.length(); j++) {
        if (sub.charAt(j) != sup.charAt(i + j)) {
          break;
        }
      }
      if (j == sub.length()) {
        if ((i + j) < sup.length()) {
          if (sup.charAt(i + j) == ' ') {
            return true;
          } else {
            return false;
          }
        } else {
          return true;
        }
      }

      // Skip forward to next space
      do {
        i++;
      } while ((i < sup.length()) && (sup.charAt(i) != ' '));
      i++;
    }
    return false;
  }
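
  // Examples (added for clarity): contains("neural", "neural networks")
  // returns true, because the match ends at a word boundary;
  // contains("networks", "neural networks") also returns true, via the
  // skip-to-next-space loop; contains("net", "network") returns false,
  // because "net" only matches inside a longer word.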

  /**
   * Returns a hashtable. Fills the hashtable
   * with the stemmed n-grams occurring in the given string
   * (as keys) and the number of times each of them occurs.
   */
  private HashMap getPhrasesForDictionary(String str) {

    String[] buffer = new String[m_MaxPhraseLength];
    HashMap hash = new HashMap();

    StringTokenizer tok = new StringTokenizer(str, "\n");
    while (tok.hasMoreTokens()) {
      String phrase = tok.nextToken();
      int numSeen = 0;
      StringTokenizer wordTok = new StringTokenizer(phrase, " ");
      while (wordTok.hasMoreTokens()) {
        String word = wordTok.nextToken();

        // Store word in buffer
        for (int i = 0; i < m_MaxPhraseLength - 1; i++) {
          buffer[i] = buffer[i + 1];
        }
        buffer[m_MaxPhraseLength - 1] = word;

        // How many are buffered?
        numSeen++;
        if (numSeen > m_MaxPhraseLength) {
          numSeen = m_MaxPhraseLength;
        }

        // Don't consider phrases that end with a stop word
        if (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - 1])) {
          continue;
        }

        // Loop through buffer and add phrases to hashtable
        StringBuffer phraseBuffer = new StringBuffer();
        for (int i = 1; i <= numSeen; i++) {
          if (i > 1) {
            phraseBuffer.insert(0, ' ');
          }
          phraseBuffer.insert(0, buffer[m_MaxPhraseLength - i]);

          // Don't consider phrases that begin with a stop word
          if ((i > 1) &&
              (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - i]))) {
            continue;
          }

          // Only consider phrases with minimum length
          if (i >= m_MinPhraseLength) {

            // Stem string
            String orig = phraseBuffer.toString();
            String internal = internalFormat(orig);
            Counter count = (Counter)hash.get(internal);
            if (count == null) {
              hash.put(internal, new Counter());
            } else {
              count.increment();
            }

            // Add components if phrase is single word before
            // conversion into internal format (i.e. error-correcting)
            /*if ((orig.indexOf(' ') == -1) &&
                (internal.indexOf(' ') != -1)) {
              StringTokenizer tokW = new StringTokenizer(internal, " ");
              while (tokW.hasMoreTokens()) {
                String comp = (String)tokW.nextToken();
                Counter countW = (Counter)hash.get(comp);
                if (countW == null) {
                  hash.put(comp, new Counter());
                } else {
                  countW.increment();
                }
              }
            }*/
          }
        }
      }
    }
    return hash;
  }

  /**
   * Expects an empty hashtable. Fills the hashtable
   * with the stemmed n-grams occurring in the given string
   * (as keys). Stores the position, the number of occurrences,
   * and the most commonly occurring original version of
   * each n-gram.
   *
   * N-grams that occur fewer than m_MinNumOccur times are not used.
   *
   * Returns the total number of words (!) in the string.
   */
  private int getPhrases(HashMap hash, String str) {

    String[] buffer = new String[m_MaxPhraseLength];

    StringTokenizer tok = new StringTokenizer(str, "\n");
    int pos = 1;
    while (tok.hasMoreTokens()) {
      String phrase = tok.nextToken();
      int numSeen = 0;
      StringTokenizer wordTok = new StringTokenizer(phrase, " ");
      while (wordTok.hasMoreTokens()) {
        String word = wordTok.nextToken();

        // Store word in buffer
        for (int i = 0; i < m_MaxPhraseLength - 1; i++) {
          buffer[i] = buffer[i + 1];
        }
        buffer[m_MaxPhraseLength - 1] = word;

        // How many are buffered?
        numSeen++;
        if (numSeen > m_MaxPhraseLength) {
          numSeen = m_MaxPhraseLength;
        }

        // Don't consider phrases that end with a stop word
        if (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - 1])) {
          pos++;
          continue;
        }

        // Loop through buffer and add phrases to hashtable
        StringBuffer phraseBuffer = new StringBuffer();
        for (int i = 1; i <= numSeen; i++) {
          if (i > 1) {
            phraseBuffer.insert(0, ' ');
          }
          phraseBuffer.insert(0, buffer[m_MaxPhraseLength - i]);

          // Don't consider phrases that begin with a stop word
          if ((i > 1) &&
              (m_Stopwords.isStopword(buffer[m_MaxPhraseLength - i]))) {
            continue;
          }

          // Only consider phrases with minimum length
          if (i >= m_MinPhraseLength) {

            // Stem string
            String phrStr = phraseBuffer.toString();
            String internal = internalFormat(phrStr);
            FastVector vec = (FastVector)hash.get(internal);
            if (vec == null) {
              vec = new FastVector(3);

              // HashMap for storing all versions
              HashMap secHash = new HashMap();
              secHash.put(phrStr, new Counter());

              // Update hashtable with all the info
              vec.addElement(new Counter(pos + 1 - i));
              vec.addElement(new Counter());
              vec.addElement(secHash);
              hash.put(internal, vec);
            } else {

              // Update number of occurrences
              ((Counter)vec.elementAt(1)).increment();

              // Update hashtable storing different versions
              HashMap secHash = (HashMap)vec.elementAt(2);
              Counter count = (Counter)secHash.get(phrStr);
              if (count == null) {
                secHash.put(phrStr, new Counter());
              } else {
                count.increment();
              }
            }
          }
        }
        pos++;
      }
    }

    // Replace secondary hashtables with the most commonly occurring
    // (canonical) version of each phrase. Delete all phrases that
    // are proper nouns.
    Iterator phrases = hash.keySet().iterator();
    while (phrases.hasNext()) {
      String phrase = (String)phrases.next();
      FastVector info = (FastVector)hash.get(phrase);

      // Occurring fewer than m_MinNumOccur times?
      if (((Counter)info.elementAt(1)).value() < m_MinNumOccur) {
        phrases.remove();
        continue;
      }

      // Get canonical form
      String canForm = canonicalForm((HashMap)info.elementAt(2));
      if (canForm == null) {
        phrases.remove();
      } else {
        info.setElementAt(canForm, 2);
      }
    }
    return pos;
  }

  /**
   * Create canonical form of phrase.
   */
  private String canonicalForm(HashMap secHash) {

    int max = 0;
    String bestVersion = null;
    boolean allFullyHyphenated = true;
    int num = 0;
    Iterator versions = secHash.keySet().iterator();
    while (versions.hasNext()) {
      num++;
      String version = (String)versions.next();

      // Are all the words joined up?
      if (!isFullyHyphenated(version)) {
        allFullyHyphenated = false;
      }

      // Check for how often this version occurs
      Counter count = (Counter)secHash.get(version);
      if (count.value() > max) {
        max = count.value();
        bestVersion = version;
      }
    }
    if ((getCheckForProperNouns()) && (num == 1) && properNoun(bestVersion)) {
      //if (allProperNouns) {
      return null;
    } else {
      if (isFullyHyphenated(bestVersion) &&
          !allFullyHyphenated) {
        bestVersion = bestVersion.replace('-', ' ');
      }
      return bestVersion;
    }
  }
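
  // Example (added for clarity): if a phrase was seen as "case-based"
  // three times and "case based" once, the hyphenated form wins on
  // frequency, but because not every occurrence was hyphenated, the
  // hyphens are replaced by spaces, giving "case based". A phrase whose
  // only observed version looks like a proper noun (e.g. "Java") is
  // dropped entirely when getCheckForProperNouns() is true.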

  /**
   * Checks whether the given phrase is
   * fully hyphenated.
   */
  private boolean isFullyHyphenated(String str) {

    return (str.indexOf(' ') == -1);
  }

  /**
   * Checks whether the given string is a
   * proper noun.
   *
   * @return true if it is a potential proper noun
   */
  private static boolean properNoun(String str) {

    // Is it more than one word?
    if (str.indexOf(' ') != -1) {
      return false;
    }

    // Does it start with an upper-case character?
    if (Character.isLowerCase(str.charAt(0))) {
      return false;
    }

    // Is there at least one character that's
    // not upper-case?
    for (int i = 1; i < str.length(); i++) {
      /*if (Character.isUpperCase(str.charAt(i))) {
        return false;
      }*/
      if (!Character.isUpperCase(str.charAt(i))) {
        return true;
      }
    }
    //return true;
    return false;
  }
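
  // Examples (added for clarity): "Java" is a potential proper noun
  // (one word, initial capital, lower-case letters after it); "JAVA"
  // is not (treated as an acronym, since no later character is
  // lower-case); "java" and "java beans" are not, either.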

  /**
   * Generates the evaluation format of a phrase.
   */
  private String evalFormat(String str) {

    return m_Stemmer.stemString(str);
  }

  /**
   * Generates the internal format of a phrase.
   */
  private String internalFormat(String str) {

    // Remove some non-alphanumeric characters
    str = str.replace('-', ' ');
    str = str.replace('/', ' ');
    str = str.replace('&', ' ');

    // Stem string
    return m_Stemmer.stemString(str);
  }
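
  // Example (added for clarity): internalFormat("case-based/indexing")
  // first becomes "case based indexing" and is then stemmed; the exact
  // stemmed form depends on the iterated Lovins stemmer, so it is not
  // reproduced here.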

  /**
   * Gets all the phrases in the given string and puts them into the
   * hashtable. Also stores the original version of the stemmed
   * phrase in the hashtable.
   */
  private HashMap getGivenKeyphrases(String str,
                                     boolean forEval) {

    HashMap hash = new HashMap();

    StringTokenizer tok = new StringTokenizer(str, "\n");
    while (tok.hasMoreTokens()) {
      String orig = tok.nextToken();
      orig = orig.trim();
      if (orig.length() > 0) {
        String modified;
        if (!forEval) {
          modified = internalFormat(orig);
        } else {
          modified = evalFormat(orig);
        }
        if (!hash.containsKey(modified)) {
          hash.put(modified, orig);
        } else {
          if (forEval) {
            System.err.println("WARNING: stem of author-assigned keyphrase " +
                               orig + " matches existing stem (skipping it)!");
          }
        }
      }
    }
    if (hash.size() == 0) {
      return null;
    } else {
      return hash;
    }
  }

  /**
   * Main method for testing this class.
   *
   * @param argv should contain arguments to the filter: use -h for help
   */
  public static void main(String [] argv) {

    try {
      if (Utils.getFlag('b', argv)) {
        Filter.batchFilterFile(new KEAFilter(), argv);
      } else {
        Filter.filterFile(new KEAFilter(), argv);
      }
    } catch (Exception ex) {
      System.out.println(ex.getMessage());
    }
  }
}
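
/*
 * Usage sketch (added; not part of the original source, assuming the
 * Weka 3.x Instances/Filter API this file targets): given a dataset
 * whose first string attribute holds the document text and whose second
 * holds the author-assigned keyphrases, the filter can be applied like
 * this:
 *
 *   Instances data = new Instances(
 *       new BufferedReader(new FileReader("documents.arff")));
 *   KEAFilter filter = new KEAFilter();
 *   filter.setInputFormat(data);
 *   for (int i = 0; i < data.numInstances(); i++) {
 *     filter.input(data.instance(i));
 *   }
 *   filter.batchFinished();
 *   Instance converted;
 *   while ((converted = filter.output()) != null) {
 *     // each output instance is one candidate phrase with its TFxIDF,
 *     // first-occurrence, probability and rank values
 *   }
 */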