Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/packages/kea/kea-3.0/KEAModelBuilder.java@ 8815

Last change on this file since 8815 was 8815, checked in by mdewsnip, 19 years ago
Kea 3.0, as downloaded from http://www.nzdl.org/kea but with CSTR_abstracts_test, CSTR_abstracts_train, Chinese_test, and Chinese_train directories removed.
Property svn:keywords set to `Author Date Id Revision`
File size: 17.9 KB

Line
1	/*
2	* KEAModelBuilder.java
3	* Copyright (C) 2001 Eibe Frank
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19
20	import java.io.*;
21	import java.util.*;
22	import weka.core.*;
23	import weka.filters.*;
24
25	/**
26	* Builds a keyphrase extraction model from the documents in a given
27	* directory. Assumes that the file names for the documents end with
28	* ".txt". Assumes that files containing corresponding
29	* author-assigned keyphrases end with ".key". Optionally an encoding
30	* for the documents/keyphrases can be defined (e.g. for Chinese
31	* text).
32	*
33	* Valid options are:<p>
34	*
35	* -l "directory name"<br>
36	* Specifies name of directory.<p>
37	*
38	* -m "model name"<br>
39	* Specifies name of model.<p>
40	*
41	* -e "encoding"<br>
42	* Specifies encoding.<p>
43	*
44	* -d<br>
45	* Turns debugging mode on.<p>
46	*
47	* -k<br>
48	* Use keyphrase frequency statistic.<p>
49	*
50	* -p<br>
51	* Disallow internal periods.<p>
52	*
53	* -x "length"<br>
54	* Sets maximum phrase length (default: 3).<p>
55	*
56	* -y "length"<br>
57	* Sets minimum phrase length (default: 1).<p>
58	*
59	* -o "number"<br>
60	* The minimum number of times a phrase needs to occur (default: 2). <p>
61	*
62	* -s "name of class implementing list of stop words"<br>
63	* Sets list of stop words to used (default: StopwordsEnglish).<p>
64	*
65	* -t "name of class implementing stemmer"<br>
66	* Sets stemmer to use (default: IteratedLovinsStemmer). <p>
67	*
68	* -n<br>
69	* Do not check for proper nouns. <p>
70	*
71	* @author Eibe Frank ([email protected])
72	* @version 1.0
73	*/
74	public class KEAModelBuilder implements OptionHandler {
75
76	/** Name of directory */
77	String m_dirName = null;
78
79	/** Name of model */
80	String m_modelName = null;
81
82	/** Encoding */
83	String m_encoding = "default";
84
85	/** Debugging mode? */
86	boolean m_debug = false;
87
88	/** Use keyphrase frequency attribute? */
89	boolean m_useKFrequency = false;
90
91	/** Disallow internal periods? */
92	boolean m_disallowIPeriods = false;
93
94	/** The maximum length of phrases */
95	private int m_MaxPhraseLength = 3;
96
97	/** The minimum length of phrases */
98	private int m_MinPhraseLength = 1;
99
100	/** The minimum number of occurences of a phrase */
101	private int m_MinNumOccur = 2;
102
103	/** The KEA filter object */
104	KEAFilter m_KEAFilter = null;
105
106	/** The stemmer to be used */
107	private Stemmer m_Stemmer = new IteratedLovinsStemmer();
108
109	/** The list of stop words to be used */
110	private Stopwords m_Stopwords = new StopwordsEnglish();
111
112	/** Determines whether check for proper nouns is performed */
113	private boolean m_CheckForProperNouns = true;
114
115	/**
116	* Get the M_CheckProperNouns value.
117	* @return the M_CheckProperNouns value.
118	*/
119	public boolean getCheckForProperNouns() {
120	return m_CheckForProperNouns;
121	}
122
123	/**
124	* Set the M_CheckProperNouns value.
125	* @param newM_CheckProperNouns The new M_CheckProperNouns value.
126	*/
127	public void setCheckForProperNouns(boolean newM_CheckProperNouns) {
128	this.m_CheckForProperNouns = newM_CheckProperNouns;
129	}
130
131	/**
132	* Get the M_Stopwords value.
133	* @return the M_Stopwords value.
134	*/
135	public Stopwords getStopwords() {
136
137	return m_Stopwords;
138	}
139
140	/**
141	* Set the M_Stopwords value.
142	* @param newM_Stopwords The new M_Stopwords value.
143	*/
144	public void setStopwords(Stopwords newM_Stopwords) {
145
146	this.m_Stopwords = newM_Stopwords;
147	}
148
149
150	/**
151	* Get the Stemmer value.
152	* @return the Stemmer value.
153	*/
154	public Stemmer getStemmer() {
155
156	return m_Stemmer;
157	}
158
159	/**
160	* Set the Stemmer value.
161	* @param newStemmer The new Stemmer value.
162	*/
163	public void setStemmer(Stemmer newStemmer) {
164
165	this.m_Stemmer = newStemmer;
166	}
167
168	/**
169	* Get the value of MinNumOccur.
170	*
171	* @return Value of MinNumOccur.
172	*/
173	public int getMinNumOccur() {
174
175	return m_MinNumOccur;
176	}
177
178	/**
179	* Set the value of MinNumOccur.
180	*
181	* @param newMinNumOccur Value to assign to MinNumOccur.
182	*/
183	public void setMinNumOccur(int newMinNumOccur) {
184
185	m_MinNumOccur = newMinNumOccur;
186	}
187
188	/**
189	* Get the value of MaxPhraseLength.
190	*
191	* @return Value of MaxPhraseLength.
192	*/
193	public int getMaxPhraseLength() {
194
195	return m_MaxPhraseLength;
196	}
197
198	/**
199	* Set the value of MaxPhraseLength.
200	*
201	* @param newMaxPhraseLength Value to assign to MaxPhraseLength.
202	*/
203	public void setMaxPhraseLength(int newMaxPhraseLength) {
204
205	m_MaxPhraseLength = newMaxPhraseLength;
206	}
207
208	/**
209	* Get the value of MinPhraseLength.
210	*
211	* @return Value of MinPhraseLength.
212	*/
213	public int getMinPhraseLength() {
214
215	return m_MinPhraseLength;
216	}
217
218	/**
219	* Set the value of MinPhraseLength.
220	*
221	* @param newMinPhraseLength Value to assign to MinPhraseLength.
222	*/
223	public void setMinPhraseLength(int newMinPhraseLength) {
224
225	m_MinPhraseLength = newMinPhraseLength;
226	}
227
228	/**
229	* Get the value of disallowIPeriods.
230	*
231	* @return Value of disallowIPeriods.
232	*/
233	public boolean getDisallowIPeriods() {
234
235	return m_disallowIPeriods;
236	}
237
238	/**
239	* Set the value of disallowIPeriods.
240	*
241	* @param newdisallowIPeriods Value to assign to disallowIPeriods.
242	*/
243	public void setDisallowIPeriods(boolean newdisallowIPeriods) {
244
245	m_disallowIPeriods = newdisallowIPeriods;
246	}
247
248	/**
249	* Get the value of useKFrequency.
250	*
251	* @return Value of useKFrequency.
252	*/
253	public boolean getUseKFrequency() {
254
255	return m_useKFrequency;
256	}
257
258	/**
259	* Set the value of useKFrequency.
260	*
261	* @param newuseKFrequency Value to assign to useKFrequency.
262	*/
263	public void setUseKFrequency(boolean newuseKFrequency) {
264
265	m_useKFrequency = newuseKFrequency;
266	}
267
268	/**
269	* Get the value of debug.
270	*
271	* @return Value of debug.
272	*/
273	public boolean getDebug() {
274
275	return m_debug;
276	}
277
278	/**
279	* Set the value of debug.
280	*
281	* @param newdebug Value to assign to debug.
282	*/
283	public void setDebug(boolean newdebug) {
284
285	m_debug = newdebug;
286	}
287
288	/**
289	* Get the value of encoding.
290	*
291	* @return Value of encoding.
292	*/
293	public String getEncoding() {
294
295	return m_encoding;
296	}
297
298	/**
299	* Set the value of encoding.
300	*
301	* @param newencoding Value to assign to encoding.
302	*/
303	public void setEncoding(String newencoding) {
304
305	m_encoding = newencoding;
306	}
307
308	/**
309	* Get the value of modelName.
310	*
311	* @return Value of modelName.
312	*/
313	public String getModelName() {
314
315	return m_modelName;
316	}
317
318	/**
319	* Set the value of modelName.
320	*
321	* @param newmodelName Value to assign to modelName.
322	*/
323	public void setModelName(String newmodelName) {
324
325	m_modelName = newmodelName;
326	}
327
328	/**
329	* Get the value of dirName.
330	*
331	* @return Value of dirName.
332	*/
333	public String getDirName() {
334
335	return m_dirName;
336	}
337
338	/**
339	* Set the value of dirName.
340	*
341	* @param newdirName Value to assign to dirName.
342	*/
343	public void setDirName(String newdirName) {
344
345	m_dirName = newdirName;
346	}
347
348	/**
349	* Parses a given list of options controlling the behaviour of this object.
350	* Valid options are:<p>
351	*
352	* -l "directory name" <br>
353	* Specifies name of directory.<p>
354	*
355	* -m "model name" <br>
356	* Specifies name of model.<p>
357	*
358	* -e "encoding" <br>
359	* Specifies encoding.<p>
360	*
361	* -d<br>
362	* Turns debugging mode on.<p>
363	*
364	* -k<br>
365	* Use keyphrase frequency statistic.<p>
366	*
367	* -p<br>
368	* Disallow internal periods. <p>
369	*
370	* -x "length"<br>
371	* Sets maximum phrase length (default: 3).<p>
372	*
373	* -y "length"<br>
374	* Sets minimum phrase length (default: 3).<p>
375	*
376	* -o "number"<br>
377	* The minimum number of times a phrase needs to occur (default: 2). <p>
378	*
379	* -s "name of class implementing list of stop words"<br>
380	* Sets list of stop words to used (default: StopwordsEnglish).<p>
381	*
382	* -t "name of class implementing stemmer"<br>
383	* Sets stemmer to use (default: IteratedLovinsStemmer). <p>
384	*
385	* -n<br>
386	* Do not check for proper nouns. <p>
387	*
388	* @param options the list of options as an array of strings
389	* @exception Exception if an option is not supported
390	*/
391	public void setOptions(String[] options) throws Exception {
392
393	String dirName = Utils.getOption('l', options);
394	if (dirName.length() > 0) {
395	setDirName(dirName);
396	} else {
397	setDirName(null);
398	throw new Exception("Name of directory required argument.");
399	}
400	String modelName = Utils.getOption('m', options);
401	if (modelName.length() > 0) {
402	setModelName(modelName);
403	} else {
404	setModelName(null);
405	throw new Exception("Name of model required argument.");
406	}
407	String encoding = Utils.getOption('e', options);
408	if (encoding.length() > 0) {
409	setEncoding(encoding);
410	} else {
411	setEncoding("default");
412	}
413	String maxPhraseLengthString = Utils.getOption('x', options);
414	if (maxPhraseLengthString.length() > 0) {
415	setMaxPhraseLength(Integer.parseInt(maxPhraseLengthString));
416	} else {
417	setMaxPhraseLength(3);
418	}
419	String minPhraseLengthString = Utils.getOption('y', options);
420	if (minPhraseLengthString.length() > 0) {
421	setMinPhraseLength(Integer.parseInt(minPhraseLengthString));
422	} else {
423	setMinPhraseLength(1);
424	}
425	String minNumOccurString = Utils.getOption('o', options);
426	if (minNumOccurString.length() > 0) {
427	setMinNumOccur(Integer.parseInt(minNumOccurString));
428	} else {
429	setMinNumOccur(2);
430	}
431	String stopwordsString = Utils.getOption('s', options);
432	if (stopwordsString.length() > 0) {
433	setStopwords((Stopwords)Class.forName(stopwordsString).newInstance());
434	}
435	String stemmerString = Utils.getOption('t', options);
436	if (stemmerString.length() > 0) {
437	setStemmer((Stemmer)Class.forName(stemmerString).newInstance());
438	}
439	setDebug(Utils.getFlag('d', options));
440	setUseKFrequency(Utils.getFlag('k', options));
441	setDisallowIPeriods(Utils.getFlag('p', options));
442	setCheckForProperNouns(!Utils.getFlag('n', options));
443	Utils.checkForRemainingOptions(options);
444	}
445
446	/**
447	* Gets the current option settings.
448	*
449	* @return an array of strings suitable for passing to setOptions
450	*/
451	public String [] getOptions() {
452
453	String [] options = new String [20];
454	int current = 0;
455
456	options[current++] = "-l";
457	options[current++] = "" + (getDirName());
458	options[current++] = "-m";
459	options[current++] = "" + (getModelName());
460	options[current++] = "-e";
461	options[current++] = "" + (getEncoding());
462	if (getUseKFrequency()) {
463	options[current++] = "-k";
464	}
465	if (getDebug()) {
466	options[current++] = "-d";
467	}
468	if (getDisallowIPeriods()) {
469	options[current++] = "-p";
470	}
471	options[current++] = "-x";
472	options[current++] = "" + (getMaxPhraseLength());
473	options[current++] = "-y";
474	options[current++] = "" + (getMinPhraseLength());
475	options[current++] = "-o";
476	options[current++] = "" + (getMinNumOccur());
477	options[current++] = "-s";
478	options[current++] = "" + (getStopwords().getClass().getName());
479	options[current++] = "-t";
480	options[current++] = "" + (getStemmer().getClass().getName());
481	if (getCheckForProperNouns()) {
482	options[current++] = "-n";
483	}
484
485	while (current < options.length) {
486	options[current++] = "";
487	}
488	return options;
489	}
490
491	/**
492	* Returns an enumeration describing the available options.
493	*
494	* @return an enumeration of all the available options
495	*/
496	public Enumeration listOptions() {
497
498	Vector newVector = new Vector(12);
499
500	newVector.addElement(new Option(
501	"\tSpecifies name of directory.",
502	"l", 1, "-l <directory name>"));
503	newVector.addElement(new Option(
504	"\tSpecifies name of model.",
505	"m", 1, "-m <model name>"));
506	newVector.addElement(new Option(
507	"\tSpecifies encoding.",
508	"e", 1, "-e <encoding>"));
509	newVector.addElement(new Option(
510	"\tTurns debugging mode on.",
511	"d", 0, "-d"));
512	newVector.addElement(new Option(
513	"\tUse keyphrase frequency statistic.",
514	"k", 0, "-k"));
515	newVector.addElement(new Option(
516	"\tDisallow internal periods.",
517	"p", 0, "-p"));
518	newVector.addElement(new Option(
519	"\tSets the maximum phrase length (default: 3).",
520	"x", 1, "-x <length>"));
521	newVector.addElement(new Option(
522	"\tSets the minimum phrase length (default: 1).",
523	"y", 1, "-y <length>"));
524	newVector.addElement(new Option(
525	"\tSet the minimum number of occurences (default: 2).",
526	"o", 1, "-o"));
527	newVector.addElement(new Option(
528	"\tSets the list of stopwords to use (default: StopwordsEnglish).",
529	"s", 1, "-s <name of stopwords class>"));
530	newVector.addElement(new Option(
531	"\tSet the stemmer to use (default: IteratedLovinsStemmer).",
532	"t", 1, "-t <name of stemmer class>"));
533	newVector.addElement(new Option(
534	"\tDo not check for proper nouns.",
535	"n", 0, "-n"));
536
537	return newVector.elements();
538	}
539
540	/**
541	* Collects the stems of the file names.
542	*/
543	public Hashtable collectStems() throws Exception {
544
545	Hashtable stems = new Hashtable();
546
547	try {
548	File dir = new File(m_dirName);
549	String[] files = dir.list();
550	for (int i = 0; i < files.length; i++) {
551	if (files[i].endsWith(".key") \|\|
552	files[i].endsWith(".txt")) {
553	String stem = files[i].substring(0, files[i].length() - 4);
554	if (!stems.containsKey(stem)) {
555	stems.put(stem, new Double(0));
556	}
557	}
558	}
559	} catch (Exception e) {
560	throw new Exception("Problem opening directory " + m_dirName);
561	}
562	return stems;
563	}
564
565	/**
566	* Builds the model from the files
567	*/
568	public void buildModel(Hashtable stems) throws Exception {
569
570	// Check whether there is actually any data
571	if (stems.size() == 0) {
572	throw new Exception("Couldn't find any data!");
573	}
574
575	FastVector atts = new FastVector(2);
576	atts.addElement(new Attribute("doc", null));
577	atts.addElement(new Attribute("keyphrases", null));
578	Instances data = new Instances("keyphrase_training_data", atts, 0);
579
580	// Build model
581	m_KEAFilter = new KEAFilter();
582	m_KEAFilter.setDebug(m_debug);
583	m_KEAFilter.setDisallowInternalPeriods(getDisallowIPeriods());
584	m_KEAFilter.setKFused(getUseKFrequency());
585	m_KEAFilter.setMaxPhraseLength(getMaxPhraseLength());
586	m_KEAFilter.setMinPhraseLength(getMinPhraseLength());
587	m_KEAFilter.setMinNumOccur(getMinNumOccur());
588	m_KEAFilter.setInputFormat(data);
589	m_KEAFilter.setStemmer(getStemmer());
590	m_KEAFilter.setStopwords(getStopwords());
591	m_KEAFilter.setCheckForProperNouns(getCheckForProperNouns());
592	Enumeration elem = stems.keys();
593	while (elem.hasMoreElements()) {
594	String str = (String)elem.nextElement();
595	double[] newInst = new double[2];
596	try {
597	File txt = new File(m_dirName + "/" + str + ".txt");
598	InputStreamReader is;
599	if (!m_encoding.equals("default")) {
600	is = new InputStreamReader(new FileInputStream(txt), m_encoding);
601	} else {
602	is = new InputStreamReader(new FileInputStream(txt));
603	}
604	StringBuffer txtStr = new StringBuffer();
605	int c;
606	while ((c = is.read()) != -1) {
607	txtStr.append((char)c);
608	}
609	newInst[0] = (double)data.attribute(0).addStringValue(txtStr.toString());
610	} catch (Exception e) {
611	if (m_debug) {
612	System.err.println("Can't find document for stem " + str + ".");
613	}
614	newInst[0] = Instance.missingValue();
615	}
616	try {
617	File key = new File(m_dirName + "/" + str + ".key");
618	InputStreamReader is;
619	if (!m_encoding.equals("default")) {
620	is = new InputStreamReader(new FileInputStream(key), m_encoding);
621	} else {
622	is = new InputStreamReader(new FileInputStream(key));
623	}
624	StringBuffer keyStr = new StringBuffer();
625	int c;
626	while ((c = is.read()) != -1) {
627	keyStr.append((char)c);
628	}
629	newInst[1] = (double)data.attribute(1).addStringValue(keyStr.toString());
630	} catch (Exception e) {
631	if (m_debug) {
632	System.err.println("Can't find keyphrases for stem " + str + ".");
633	}
634	newInst[1] = Instance.missingValue();
635	}
636	data.add(new Instance(1.0, newInst));
637	m_KEAFilter.input(data.instance(0));
638	data = data.stringFreeStructure();
639	}
640	m_KEAFilter.batchFinished();
641
642	// Get rid of instances in filter
643	Instance dummy;
644	while ((dummy = m_KEAFilter.output()) != null) {};
645	}
646
647	/**
648	* Saves the extraction model to the file.
649	*/
650	public void saveModel() throws Exception {
651
652	BufferedOutputStream bufferedOut =
653	new BufferedOutputStream(new FileOutputStream(m_modelName));
654	ObjectOutputStream out = new ObjectOutputStream(bufferedOut);
655	out.writeObject(m_KEAFilter);
656	out.flush();
657	out.close();
658	}
659
660	/**
661	* The main method.
662	*/
663	public static void main(String[] ops) {
664
665	KEAModelBuilder kmb = new KEAModelBuilder();
666	try {
667	kmb.setOptions(ops);
668	System.err.print("Building model with options: ");
669	String[] optionSettings = kmb.getOptions();
670	for (int i = 0; i < optionSettings.length; i++) {
671	System.err.print(optionSettings[i] + " ");
672	}
673	System.err.println();
674	kmb.buildModel(kmb.collectStems());
675	kmb.saveModel();
676	} catch (Exception e) {
677	e.printStackTrace();
678	System.err.println(e.getMessage());
679	System.err.println("\nOptions:\n");
680	Enumeration enum = kmb.listOptions();
681	while (enum.hasMoreElements()) {
682	Option option = (Option) enum.nextElement();
683	System.err.println(option.synopsis());
684	System.err.println(option.description());
685	}
686	}
687	}
688	}
689

Note: See TracBrowser for help on using the repository browser.

Download in other formats: