Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/packages/kea/kea-3.0/weka/filters/DiscretizeFilter.java@ 8815

Last change on this file since 8815 was 8815, checked in by mdewsnip, 19 years ago
Kea 3.0, as downloaded from http://www.nzdl.org/kea but with CSTR_abstracts_test, CSTR_abstracts_train, Chinese_test, and Chinese_train directories removed.
Property svn:keywords set to `Author Date Id Revision`
File size: 35.9 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* DiscretizeFilter.java
19	* Copyright (C) 1999 Eibe Frank,Len Trigg
20	*
21	*/
22
23
24	package weka.filters;
25
26	import java.io.*;
27	import java.util.*;
28	import weka.core.*;
29
30	/**
31	* An instance filter that discretizes a range of numeric attributes in
32	* the dataset into nominal attributes. Discretization can be either by
33	* simple binning, or by Fayyad & Irani's MDL method (the default).<p>
34	*
35	* Valid filter-specific options are: <p>
36	*
37	* -B num <br>
38	* Specifies the (maximum) number of bins to divide numeric attributes into.
39	* (default: class-based discretisation).<p>
40	*
41	* -F <br>
42	* Use equal-frequency instead of equal-width discretization if
43	* class-based discretisation is turned off.<p>
44	*
45	* -O <br>
46	* Optimize the number of bins using a leave-one-out estimate of the
47	* entropy (for equal-width binning).<p>
48	*
49	* -R col1,col2-col4,... <br>
50	* Specifies list of columns to Discretize. First
51	* and last are valid indexes. (default: none) <p>
52	*
53	* -V <br>
54	* Invert matching sense.<p>
55	*
56	* -D <br>
57	* Make binary nominal attributes. <p>
58	*
59	* -E <br>
60	* Use better encoding of split point for MDL. <p>
61	*
62	* -K <br>
63	* Use Kononeko's MDL criterion. <p>
64	*
65	* @author Len Trigg ([email protected])
66	* @author Eibe Frank ([email protected])
67	* @version $Revision: 8815 $
68	*/
69	public class DiscretizeFilter extends Filter
70	implements OptionHandler, WeightedInstancesHandler {
71
72	/** Stores which columns to Discretize */
73	protected Range m_DiscretizeCols = new Range();
74
75	/** The number of bins to divide the attribute into */
76	protected int m_NumBins = 10;
77
78	/** Store the current cutpoints */
79	protected double [][] m_CutPoints = null;
80
81	/** True if discretisation will be done by MDL rather than binning */
82	protected boolean m_UseMDL = true;
83
84	/** Output binary attributes for discretized attributes. */
85	protected boolean m_MakeBinary = false;
86
87	/** Use better encoding of split point for MDL. */
88	protected boolean m_UseBetterEncoding = false;
89
90	/** Use Kononenko's MDL criterion instead of Fayyad et al.'s */
91	protected boolean m_UseKononenko = false;
92
93	/** Find the number of bins using cross-validated entropy. */
94	protected boolean m_FindNumBins = false;
95
96	/** Use equal-frequency binning if unsupervised discretization turned on */
97	protected boolean m_UseEqualFrequency = false;
98
99	/** Constructor - initialises the filter */
100	public DiscretizeFilter() {
101
102	setAttributeIndices("first-last");
103	}
104
105
106	/**
107	* Gets an enumeration describing the available options
108	*
109	* @return an enumeration of all the available options
110	*/
111	public Enumeration listOptions() {
112
113	Vector newVector = new Vector(7);
114
115	newVector.addElement(new Option(
116	"\tSpecifies the (maximum) number of bins to divide numeric"
117	+ " attributes into.\n"
118	+ "\t(default class-based discretization)",
119	"B", 1, "-B <num>"));
120
121	newVector.addElement(new Option(
122	"\tUse equal-frequency instead of equal-width with\n"+
123	"\tunsupervised discretization.",
124	"F", 0, "-F"));
125
126	newVector.addElement(new Option(
127	"\tOptimize number of bins using leave-one-out estimate\n"+
128	"\tof estimated entropy (for equal-width discretization).",
129	"O", 0, "-O"));
130
131	/* If we decide to implement loading and saving cutfiles like
132	* the C Discretizer (which is probably not necessary)
133	newVector.addElement(new Option(
134	"\tSpecify that the cutpoints should be loaded from a file.",
135	"L", 1, "-L <file>"));
136	newVector.addElement(new Option(
137	"\tSpecify that the chosen cutpoints should be saved to a file.",
138	"S", 1, "-S <file>"));
139	*/
140
141	newVector.addElement(new Option(
142	"\tSpecifies list of columns to Discretize. First"
143	+ " and last are valid indexes.\n"
144	+ "\t(default none)",
145	"R", 1, "-R <col1,col2-col4,...>"));
146
147	newVector.addElement(new Option(
148	"\tInvert matching sense of column indexes.",
149	"V", 0, "-V"));
150
151	newVector.addElement(new Option(
152	"\tOutput binary attributes for discretized attributes.",
153	"D", 0, "-D"));
154
155	newVector.addElement(new Option(
156	"\tUse better encoding of split point for MDL.",
157	"E", 0, "-E"));
158
159	newVector.addElement(new Option(
160	"\tUse Kononenko's MDL criterion.",
161	"K", 0, "-K"));
162
163	return newVector.elements();
164	}
165
166
167	/**
168	* Parses the options for this object. Valid options are: <p>
169	*
170	* -B num <br>
171	* Specifies the (maximum) number of bins to divide numeric attributes into.
172	* (default class-based discretisation).<p>
173	*
174	* -F <br>
175	* Use equal-frequency instead of equal-width discretization if
176	* class-based discretisation is turned off.<p>
177	*
178	* -O <br>
179	* Optimize the number of bins using a leave-one-out estimate of the
180	* entropy (for equal-width binning).<p>
181	*
182	* -R col1,col2-col4,... <br>
183	* Specifies list of columns to Discretize. First
184	* and last are valid indexes. (default none) <p>
185	*
186	* -V <br>
187	* Invert matching sense.<p>
188	*
189	* -D <br>
190	* Make binary nominal attributes. <p>
191	*
192	* -E <br>
193	* Use better encoding of split point for MDL. <p>
194	*
195	* -K <br>
196	* Use Kononeko's MDL criterion. <p>
197	*
198	* @param options the list of options as an array of strings
199	* @exception Exception if an option is not supported
200	*/
201	public void setOptions(String[] options) throws Exception {
202
203	setMakeBinary(Utils.getFlag('D', options));
204	setUseEqualFrequency(Utils.getFlag('F', options));
205	setUseBetterEncoding(Utils.getFlag('E', options));
206	setUseKononenko(Utils.getFlag('K', options));
207	setFindNumBins(Utils.getFlag('O', options));
208	setInvertSelection(Utils.getFlag('V', options));
209	setUseMDL(true);
210
211	String numBins = Utils.getOption('B', options);
212	if (numBins.length() != 0) {
213	setBins(Integer.parseInt(numBins));
214	setUseMDL(false);
215	} else {
216	setBins(10);
217	}
218
219	String convertList = Utils.getOption('R', options);
220	if (convertList.length() != 0) {
221	setAttributeIndices(convertList);
222	} else {
223	setAttributeIndices("first-last");
224	}
225
226	if (getInputFormat() != null) {
227	setInputFormat(getInputFormat());
228	}
229	}
230	/**
231	* Gets the current settings of the filter.
232	*
233	* @return an array of strings suitable for passing to setOptions
234	*/
235	public String [] getOptions() {
236
237	String [] options = new String [12];
238	int current = 0;
239
240	if (getMakeBinary()) {
241	options[current++] = "-D";
242	}
243	if (getUseEqualFrequency()) {
244	options[current++] = "-F";
245	}
246	if (getUseBetterEncoding()) {
247	options[current++] = "-E";
248	}
249	if (getUseKononenko()) {
250	options[current++] = "-K";
251	}
252	if (getFindNumBins()) {
253	options[current++] = "-O";
254	}
255	if (getInvertSelection()) {
256	options[current++] = "-V";
257	}
258	if (!getUseMDL()) {
259	options[current++] = "-B"; options[current++] = "" + getBins();
260	}
261	if (!getAttributeIndices().equals("")) {
262	options[current++] = "-R"; options[current++] = getAttributeIndices();
263	}
264	while (current < options.length) {
265	options[current++] = "";
266	}
267	return options;
268	}
269
270
271	/**
272	* Sets the format of the input instances.
273	*
274	* @param instanceInfo an Instances object containing the input instance
275	* structure (any instances contained in the object are ignored - only the
276	* structure is required).
277	* @return true if the outputFormat may be collected immediately
278	* @exception Exception if the input format can't be set successfully
279	*/
280	public boolean setInputFormat(Instances instanceInfo) throws Exception {
281
282	super.setInputFormat(instanceInfo);
283
284	m_DiscretizeCols.setUpper(instanceInfo.numAttributes() - 1);
285	m_CutPoints = null;
286	if (m_UseMDL) {
287	if (instanceInfo.classIndex() < 0) {
288	throw new UnassignedClassException("Cannot use class-based discretization: "
289	+ "no class assigned to the dataset");
290	}
291	if (!instanceInfo.classAttribute().isNominal()) {
292	throw new UnsupportedClassTypeException("Supervised discretization not possible:"
293	+ " class is not nominal!");
294	}
295	} else {
296	if (getFindNumBins() && getUseEqualFrequency()) {
297	throw new IllegalArgumentException("Bin number optimization in conjunction "+
298	"with equal-frequency binning not implemented.");
299	}
300	}
301
302	// If we implement loading cutfiles, then load
303	//them here and set the output format
304	return false;
305	}
306
307
308
309	/**
310	* Input an instance for filtering. Ordinarily the instance is processed
311	* and made available for output immediately. Some filters require all
312	* instances be read before producing output.
313	*
314	* @param instance the input instance
315	* @return true if the filtered instance may now be
316	* collected with output().
317	* @exception IllegalStateException if no input format has been defined.
318	*/
319	public boolean input(Instance instance) {
320
321	if (getInputFormat() == null) {
322	throw new IllegalStateException("No input instance format defined");
323	}
324	if (m_NewBatch) {
325	resetQueue();
326	m_NewBatch = false;
327	}
328
329	if (m_CutPoints != null) {
330	convertInstance(instance);
331	return true;
332	}
333
334	bufferInput(instance);
335	return false;
336	}
337
338
339	/**
340	* Signifies that this batch of input to the filter is finished. If the
341	* filter requires all instances prior to filtering, output() may now
342	* be called to retrieve the filtered instances.
343	*
344	* @return true if there are instances pending output
345	* @exception IllegalStateException if no input structure has been defined
346	*/
347	public boolean batchFinished() {
348
349	if (getInputFormat() == null) {
350	throw new IllegalStateException("No input instance format defined");
351	}
352	if (m_CutPoints == null) {
353	calculateCutPoints();
354
355	setOutputFormat();
356
357	// If we implement saving cutfiles, save the cuts here
358
359	// Convert pending input instances
360	for(int i = 0; i < getInputFormat().numInstances(); i++) {
361	convertInstance(getInputFormat().instance(i));
362	}
363	}
364	flushInput();
365
366	m_NewBatch = true;
367	return (numPendingOutput() != 0);
368	}
369
370	/**
371	* Returns a string describing this filter
372	*
373	* @return a description of the filter suitable for
374	* displaying in the explorer/experimenter gui
375	*/
376	public String globalInfo() {
377
378	return "An instance filter that discretizes a range of numeric"
379	+ " attributes in the dataset into nominal attributes."
380	+ " Discretization can be either by simple binning, or by"
381	+ " Fayyad & Irani's MDL method (the default).";
382	}
383
384	/**
385	* Returns the tip text for this property
386	*
387	* @return tip text for this property suitable for
388	* displaying in the explorer/experimenter gui
389	*/
390	public String findNumBinsTipText() {
391
392	return "Optimize number of equal-width bins using leave-one-out.";
393	}
394
395	/**
396	* Get the value of FindNumBins.
397	*
398	* @return Value of FindNumBins.
399	*/
400	public boolean getFindNumBins() {
401
402	return m_FindNumBins;
403	}
404
405	/**
406	* Set the value of FindNumBins.
407	*
408	* @param newFindNumBins Value to assign to FindNumBins.
409	*/
410	public void setFindNumBins(boolean newFindNumBins) {
411
412	m_FindNumBins = newFindNumBins;
413	}
414
415	/**
416	* Returns the tip text for this property
417	*
418	* @return tip text for this property suitable for
419	* displaying in the explorer/experimenter gui
420	*/
421	public String makeBinaryTipText() {
422
423	return "Make resulting attributes binary.";
424	}
425
426	/**
427	* Gets whether binary attributes should be made for discretized ones.
428	*
429	* @return true if attributes will be binarized
430	*/
431	public boolean getMakeBinary() {
432
433	return m_MakeBinary;
434	}
435
436	/**
437	* Sets whether binary attributes should be made for discretized ones.
438	*
439	* @param makeBinary if binary attributes are to be made
440	*/
441	public void setMakeBinary(boolean makeBinary) {
442
443	m_MakeBinary = makeBinary;
444	}
445
446	/**
447	* Returns the tip text for this property
448	*
449	* @return tip text for this property suitable for
450	* displaying in the explorer/experimenter gui
451	*/
452	public String useMDLTipText() {
453
454	return "Use class-based discretization. If set to false, does"
455	+ " not require a class attribute, and uses a fixed number"
456	+ " of bins (according to bins setting).";
457	}
458
459	/**
460	* Gets whether MDL will be used as the discretisation method.
461	*
462	* @return true if so, false if fixed bins should be used.
463	*/
464	public boolean getUseMDL() {
465
466	return m_UseMDL;
467	}
468
469	/**
470	* Sets whether MDL will be used as the discretisation method.
471	*
472	* @param useMDL true if MDL should be used, false if fixed bins should
473	* be used.
474	*/
475	public void setUseMDL(boolean useMDL) {
476
477	m_UseMDL = useMDL;
478	}
479
480	/**
481	* Returns the tip text for this property
482	*
483	* @return tip text for this property suitable for
484	* displaying in the explorer/experimenter gui
485	*/
486	public String useKononenkoTipText() {
487
488	return "Use Kononenko's MDL criterion. If set to false"
489	+ " uses the Fayyad & Irani criterion.";
490	}
491
492	/**
493	* Get the value of UseEqualFrequency.
494	*
495	* @return Value of UseEqualFrequency.
496	*/
497	public boolean getUseEqualFrequency() {
498
499	return m_UseEqualFrequency;
500	}
501
502	/**
503	* Set the value of UseEqualFrequency.
504	*
505	* @param newUseEqualFrequency Value to assign to UseEqualFrequency.
506	*/
507	public void setUseEqualFrequency(boolean newUseEqualFrequency) {
508
509	m_UseEqualFrequency = newUseEqualFrequency;
510	}
511
512	/**
513	* Gets whether Kononenko's MDL criterion is to be used.
514	*
515	* @return true if Kononenko's criterion will be used.
516	*/
517	public boolean getUseKononenko() {
518
519	return m_UseKononenko;
520	}
521
522	/**
523	* Sets whether Kononenko's MDL criterion is to be used.
524	*
525	* @param useKon true if Kononenko's one is to be used
526	*/
527	public void setUseKononenko(boolean useKon) {
528
529	m_UseKononenko = useKon;
530	}
531
532	/**
533	* Returns the tip text for this property
534	*
535	* @return tip text for this property suitable for
536	* displaying in the explorer/experimenter gui
537	*/
538	public String useBetterEncodingTipText() {
539
540	return "Uses a more efficient split point encoding.";
541	}
542
543	/**
544	* Gets whether better encoding is to be used for MDL.
545	*
546	* @return true if the better MDL encoding will be used
547	*/
548	public boolean getUseBetterEncoding() {
549
550	return m_UseBetterEncoding;
551	}
552
553	/**
554	* Sets whether better encoding is to be used for MDL.
555	*
556	* @param useBetterEncoding true if better encoding to be used.
557	*/
558	public void setUseBetterEncoding(boolean useBetterEncoding) {
559
560	m_UseBetterEncoding = useBetterEncoding;
561	}
562
563	/**
564	* Returns the tip text for this property
565	*
566	* @return tip text for this property suitable for
567	* displaying in the explorer/experimenter gui
568	*/
569	public String binsTipText() {
570
571	return "Number of bins for class-blind discretisation. This"
572	+ " setting is ignored if MDL-based discretisation is used.";
573	}
574
575	/**
576	* Gets the number of bins numeric attributes will be divided into
577	*
578	* @return the number of bins.
579	*/
580	public int getBins() {
581
582	return m_NumBins;
583	}
584
585	/**
586	* Sets the number of bins to divide each selected numeric attribute into
587	*
588	* @param numBins the number of bins
589	*/
590	public void setBins(int numBins) {
591
592	m_NumBins = numBins;
593	}
594
595	/**
596	* Returns the tip text for this property
597	*
598	* @return tip text for this property suitable for
599	* displaying in the explorer/experimenter gui
600	*/
601	public String invertSelectionTipText() {
602
603	return "Set attribute selection mode. If false, only selected"
604	+ " (numeric) attributes in the range will be discretized; if"
605	+ " true, only non-selected attributes will be discretized.";
606	}
607
608	/**
609	* Gets whether the supplied columns are to be removed or kept
610	*
611	* @return true if the supplied columns will be kept
612	*/
613	public boolean getInvertSelection() {
614
615	return m_DiscretizeCols.getInvert();
616	}
617
618	/**
619	* Sets whether selected columns should be removed or kept. If true the
620	* selected columns are kept and unselected columns are deleted. If false
621	* selected columns are deleted and unselected columns are kept.
622	*
623	* @param invert the new invert setting
624	*/
625	public void setInvertSelection(boolean invert) {
626
627	m_DiscretizeCols.setInvert(invert);
628	}
629
630	/**
631	* Returns the tip text for this property
632	*
633	* @return tip text for this property suitable for
634	* displaying in the explorer/experimenter gui
635	*/
636	public String attributeIndicesTipText() {
637	return "Specify range of attributes to act on."
638	+ " This is a comma separated list of attribute indices, with"
639	+ " \"first\" and \"last\" valid values. Specify an inclusive"
640	+ " range with \"-\". E.g: \"first-3,5,6-10,last\".";
641	}
642
643	/**
644	* Gets the current range selection
645	*
646	* @return a string containing a comma separated list of ranges
647	*/
648	public String getAttributeIndices() {
649
650	return m_DiscretizeCols.getRanges();
651	}
652
653	/**
654	* Sets which attributes are to be Discretized (only numeric
655	* attributes among the selection will be Discretized).
656	*
657	* @param rangeList a string representing the list of attributes. Since
658	* the string will typically come from a user, attributes are indexed from
659	* 1. <br>
660	* eg: first-3,5,6-last
661	* @exception IllegalArgumentException if an invalid range list is supplied
662	*/
663	public void setAttributeIndices(String rangeList) {
664
665	m_DiscretizeCols.setRanges(rangeList);
666	}
667
668	/**
669	* Sets which attributes are to be Discretized (only numeric
670	* attributes among the selection will be Discretized).
671	*
672	* @param attributes an array containing indexes of attributes to Discretize.
673	* Since the array will typically come from a program, attributes are indexed
674	* from 0.
675	* @exception IllegalArgumentException if an invalid set of ranges
676	* is supplied
677	*/
678	public void setAttributeIndicesArray(int [] attributes) {
679
680	setAttributeIndices(Range.indicesToRangeList(attributes));
681	}
682
683	/**
684	* Gets the cut points for an attribute
685	*
686	* @param the index (from 0) of the attribute to get the cut points of
687	* @return an array containing the cutpoints (or null if the
688	* attribute requested isn't being Discretized
689	*/
690	public double [] getCutPoints(int attributeIndex) {
691
692	if (m_CutPoints == null) {
693	return null;
694	}
695	return m_CutPoints[attributeIndex];
696	}
697
698	/** Generate the cutpoints for each attribute */
699	protected void calculateCutPoints() {
700
701	Instances copy = null;
702
703	m_CutPoints = new double [getInputFormat().numAttributes()] [];
704	for(int i = getInputFormat().numAttributes() - 1; i >= 0; i--) {
705	if ((m_DiscretizeCols.isInRange(i)) &&
706	(getInputFormat().attribute(i).isNumeric())) {
707	if (m_UseMDL) {
708
709	// Use copy to preserve order
710	if (copy == null) {
711	copy = new Instances(getInputFormat());
712	}
713	calculateCutPointsByMDL(i, copy);
714	} else {
715	if (m_FindNumBins) {
716	findNumBins(i);
717	} else if (!m_UseEqualFrequency) {
718	calculateCutPointsByEqualWidthBinning(i);
719	} else {
720	calculateCutPointsByEqualFrequencyBinning(i);
721	}
722	}
723	}
724	}
725	}
726
727	/**
728	* Set cutpoints for a single attribute using MDL.
729	*
730	* @param index the index of the attribute to set cutpoints for
731	*/
732	protected void calculateCutPointsByMDL(int index,
733	Instances data) {
734
735	// Sort instances
736	data.sort(data.attribute(index));
737
738	// Find first instances that's missing
739	int firstMissing = data.numInstances();
740	for (int i = 0; i < data.numInstances(); i++) {
741	if (data.instance(i).isMissing(index)) {
742	firstMissing = i;
743	break;
744	}
745	}
746	m_CutPoints[index] = cutPointsForSubset(data, index, 0, firstMissing);
747	}
748
749	/** Test using Kononenko's MDL criterion. */
750	private boolean KononenkosMDL(double[] priorCounts,
751	double[][] bestCounts,
752	double numInstances,
753	int numCutPoints) {
754
755	double distPrior, instPrior, distAfter = 0, sum, instAfter = 0;
756	double before, after;
757	int numClassesTotal;
758
759	// Number of classes occuring in the set
760	numClassesTotal = 0;
761	for (int i = 0; i < priorCounts.length; i++) {
762	if (priorCounts[i] > 0) {
763	numClassesTotal++;
764	}
765	}
766
767	// Encode distribution prior to split
768	distPrior = SpecialFunctions.log2Binomial(numInstances
769	+ numClassesTotal - 1,
770	numClassesTotal - 1);
771
772	// Encode instances prior to split.
773	instPrior = SpecialFunctions.log2Multinomial(numInstances,
774	priorCounts);
775
776	before = instPrior + distPrior;
777
778	// Encode distributions and instances after split.
779	for (int i = 0; i < bestCounts.length; i++) {
780	sum = Utils.sum(bestCounts[i]);
781	distAfter += SpecialFunctions.log2Binomial(sum + numClassesTotal - 1,
782	numClassesTotal - 1);
783	instAfter += SpecialFunctions.log2Multinomial(sum,
784	bestCounts[i]);
785	}
786
787	// Coding cost after split
788	after = Utils.log2(numCutPoints) + distAfter + instAfter;
789
790	// Check if split is to be accepted
791	return (Utils.gr(before, after));
792	}
793
794
795	/** Test using Fayyad and Irani's MDL criterion. */
796	private boolean FayyadAndIranisMDL(double[] priorCounts,
797	double[][] bestCounts,
798	double numInstances,
799	int numCutPoints) {
800
801	double priorEntropy, entropy, gain;
802	double entropyLeft, entropyRight, delta;
803	int numClassesTotal, numClassesRight, numClassesLeft;
804
805	// Compute entropy before split.
806	priorEntropy = ContingencyTables.entropy(priorCounts);
807
808	// Compute entropy after split.
809	entropy = ContingencyTables.entropyConditionedOnRows(bestCounts);
810
811	// Compute information gain.
812	gain = priorEntropy - entropy;
813
814	// Number of classes occuring in the set
815	numClassesTotal = 0;
816	for (int i = 0; i < priorCounts.length; i++) {
817	if (priorCounts[i] > 0) {
818	numClassesTotal++;
819	}
820	}
821
822	// Number of classes occuring in the left subset
823	numClassesLeft = 0;
824	for (int i = 0; i < bestCounts[0].length; i++) {
825	if (bestCounts[0][i] > 0) {
826	numClassesLeft++;
827	}
828	}
829
830	// Number of classes occuring in the right subset
831	numClassesRight = 0;
832	for (int i = 0; i < bestCounts[1].length; i++) {
833	if (bestCounts[1][i] > 0) {
834	numClassesRight++;
835	}
836	}
837
838	// Entropy of the left and the right subsets
839	entropyLeft = ContingencyTables.entropy(bestCounts[0]);
840	entropyRight = ContingencyTables.entropy(bestCounts[1]);
841
842	// Compute terms for MDL formula
843	delta = Utils.log2(Math.pow(3, numClassesTotal) - 2) -
844	(((double) numClassesTotal * priorEntropy) -
845	(numClassesRight * entropyRight) -
846	(numClassesLeft * entropyLeft));
847
848	// Check if split is to be accepted
849	return (Utils.gr(gain, (Utils.log2(numCutPoints) + delta) /
850	(double)numInstances));
851	}
852
853
854	/** Selects cutpoints for sorted subset. */
855	private double[] cutPointsForSubset(Instances instances, int attIndex,
856	int first, int lastPlusOne) {
857
858	double[][] counts, bestCounts;
859	double[] priorCounts, left, right, cutPoints;
860	double currentCutPoint = -Double.MAX_VALUE, bestCutPoint = -1,
861	currentEntropy, bestEntropy, priorEntropy, gain;
862	int bestIndex = -1, numInstances = 0, numCutPoints = 0;
863
864	// Compute number of instances in set
865	if ((lastPlusOne - first) < 2) {
866	return null;
867	}
868
869	// Compute class counts.
870	counts = new double[2][instances.numClasses()];
871	for (int i = first; i < lastPlusOne; i++) {
872	numInstances += instances.instance(i).weight();
873	counts[1][(int)instances.instance(i).classValue()] +=
874	instances.instance(i).weight();
875	}
876
877	// Save prior counts
878	priorCounts = new double[instances.numClasses()];
879	System.arraycopy(counts[1], 0, priorCounts, 0,
880	instances.numClasses());
881
882	// Entropy of the full set
883	priorEntropy = ContingencyTables.entropy(priorCounts);
884	bestEntropy = priorEntropy;
885
886	// Find best entropy.
887	bestCounts = new double[2][instances.numClasses()];
888	for (int i = first; i < (lastPlusOne - 1); i++) {
889	counts[0][(int)instances.instance(i).classValue()] +=
890	instances.instance(i).weight();
891	counts[1][(int)instances.instance(i).classValue()] -=
892	instances.instance(i).weight();
893	if (Utils.sm(instances.instance(i).value(attIndex),
894	instances.instance(i + 1).value(attIndex))) {
895	currentCutPoint = instances.instance(i).value(attIndex); //+
896	//instances.instance(i + 1).value(attIndex)) / 2.0;
897	currentEntropy = ContingencyTables.entropyConditionedOnRows(counts);
898	if (Utils.sm(currentEntropy, bestEntropy)) {
899	bestCutPoint = currentCutPoint;
900	bestEntropy = currentEntropy;
901	bestIndex = i;
902	System.arraycopy(counts[0], 0,
903	bestCounts[0], 0, instances.numClasses());
904	System.arraycopy(counts[1], 0,
905	bestCounts[1], 0, instances.numClasses());
906	}
907	numCutPoints++;
908	}
909	}
910
911	// Use worse encoding?
912	if (!m_UseBetterEncoding) {
913	numCutPoints = (lastPlusOne - first) - 1;
914	}
915
916	// Checks if gain is zero
917	gain = priorEntropy - bestEntropy;
918	if (Utils.eq(gain, 0)) {
919	return null;
920	}
921
922	// Check if split is to be accepted
923	if ((m_UseKononenko && KononenkosMDL(priorCounts, bestCounts,
924	numInstances, numCutPoints)) \|\|
925	(!m_UseKononenko && FayyadAndIranisMDL(priorCounts, bestCounts,
926	numInstances, numCutPoints))) {
927
928	// Select split points for the left and right subsets
929	left = cutPointsForSubset(instances, attIndex, first, bestIndex + 1);
930	right = cutPointsForSubset(instances, attIndex,
931	bestIndex + 1, lastPlusOne);
932
933	// Merge cutpoints and return them
934	if ((left == null) && (right) == null) {
935	cutPoints = new double[1];
936	cutPoints[0] = bestCutPoint;
937	} else if (right == null) {
938	cutPoints = new double[left.length + 1];
939	System.arraycopy(left, 0, cutPoints, 0, left.length);
940	cutPoints[left.length] = bestCutPoint;
941	} else if (left == null) {
942	cutPoints = new double[1 + right.length];
943	cutPoints[0] = bestCutPoint;
944	System.arraycopy(right, 0, cutPoints, 1, right.length);
945	} else {
946	cutPoints = new double[left.length + right.length + 1];
947	cutPoints = new double[left.length + right.length + 1];
948	System.arraycopy(left, 0, cutPoints, 0, left.length);
949	cutPoints[left.length] = bestCutPoint;
950	System.arraycopy(right, 0, cutPoints, left.length + 1, right.length);
951	}
952
953	return cutPoints;
954	} else
955	return null;
956	}
957
958	/**
959	* Set cutpoints for a single attribute.
960	*
961	* @param index the index of the attribute to set cutpoints for
962	*/
963	protected void calculateCutPointsByEqualWidthBinning(int index) {
964
965	// Scan for max and min values
966	double max = 0, min = 1, currentVal;
967	Instance currentInstance;
968	for(int i = 0; i < getInputFormat().numInstances(); i++) {
969	currentInstance = getInputFormat().instance(i);
970	if (!currentInstance.isMissing(index)) {
971	currentVal = currentInstance.value(index);
972	if (max < min) {
973	max = min = currentVal;
974	}
975	if (currentVal > max) {
976	max = currentVal;
977	}
978	if (currentVal < min) {
979	min = currentVal;
980	}
981	}
982	}
983	double binWidth = (max - min) / m_NumBins;
984	double [] cutPoints = null;
985	if ((m_NumBins > 1) && (binWidth > 0)) {
986	cutPoints = new double [m_NumBins - 1];
987	for(int i = 1; i < m_NumBins; i++) {
988	cutPoints[i - 1] = min + binWidth * i;
989	}
990	}
991	m_CutPoints[index] = cutPoints;
992	}
993
994	/**
995	* Set cutpoints for a single attribute.
996	*
997	* @param index the index of the attribute to set cutpoints for
998	*/
999	protected void calculateCutPointsByEqualFrequencyBinning(int index) {
1000
1001	// Copy data so that it can be sorted
1002	Instances data = new Instances(getInputFormat());
1003
1004	// Sort input data
1005	data.sort(index);
1006
1007	// Compute weight of instances without missing values
1008	double sumOfWeights = 0;
1009	for (int i = 0; i < data.numInstances(); i++) {
1010	if (data.instance(i).isMissing(index)) {
1011	break;
1012	} else {
1013	sumOfWeights += data.instance(i).weight();
1014	}
1015	}
1016	double freq = sumOfWeights / m_NumBins;
1017
1018	// Compute break points
1019	double[] cutPoints = new double[m_NumBins - 1];
1020	double counter = 0;
1021	int cpindex = 0;
1022	for (int i = 0; i < data.numInstances() - 1; i++) {
1023
1024	// Stop if value missing
1025	if (data.instance(i).isMissing(index)) {
1026	break;
1027	}
1028	counter += data.instance(i).weight();
1029
1030	// Do we have a potential breakpoint?
1031	if (data.instance(i).value(index) <
1032	data.instance(i + 1).value(index)) {
1033	if (counter >= freq) {
1034	cutPoints[cpindex] = (data.instance(i).value(index) +
1035	data.instance(i + 1).value(index)) / 2;
1036	cpindex++;
1037	counter = counter - freq;
1038	}
1039	}
1040	}
1041
1042	// Did we find any cutpoints?
1043	if (cpindex == 0) {
1044	m_CutPoints[index] = null;
1045	} else {
1046	double[] cp = new double[cpindex];
1047	for (int i = 0; i < cpindex; i++) {
1048	cp[i] = cutPoints[i];
1049	}
1050	m_CutPoints[index] = cp;
1051	}
1052	}
1053
1054	/**
1055	* Optimizes the number of bins using leave-one-out cross-validation.
1056	*
1057	* @param index the attribute index
1058	*/
1059	protected void findNumBins(int index) {
1060
1061	double min = Double.MAX_VALUE, max = -Double.MIN_VALUE, binWidth = 0,
1062	entropy, bestEntropy = Double.MAX_VALUE, currentVal;
1063	double[] distribution;
1064	int bestNumBins = 1;
1065	Instance currentInstance;
1066
1067	// Find minimum and maximum
1068	for (int i = 0; i < getInputFormat().numInstances(); i++) {
1069	currentInstance = getInputFormat().instance(i);
1070	if (!currentInstance.isMissing(index)) {
1071	currentVal = currentInstance.value(index);
1072	if (currentVal > max) {
1073	max = currentVal;
1074	}
1075	if (currentVal < min) {
1076	min = currentVal;
1077	}
1078	}
1079	}
1080
1081	// Find best number of bins
1082	for (int i = 0; i < m_NumBins; i++) {
1083	distribution = new double[i + 1];
1084	binWidth = (max - min) / (i + 1);
1085
1086	// Compute distribution
1087	for (int j = 0; j < getInputFormat().numInstances(); j++) {
1088	currentInstance = getInputFormat().instance(j);
1089	if (!currentInstance.isMissing(index)) {
1090	for (int k = 0; k < i + 1; k++) {
1091	if (currentInstance.value(index) <=
1092	(min + (((double)k + 1) * binWidth))) {
1093	distribution[k] += currentInstance.weight();
1094	break;
1095	}
1096	}
1097	}
1098	}
1099
1100	// Compute cross-validated entropy
1101	entropy = 0;
1102	for (int k = 0; k < i + 1; k++) {
1103	if (distribution[k] < 2) {
1104	entropy = Double.MAX_VALUE;
1105	break;
1106	}
1107	entropy -= distribution[k] * Math.log((distribution[k] - 1) /
1108	binWidth);
1109	}
1110
1111	// Best entropy so far?
1112	if (entropy < bestEntropy) {
1113	bestEntropy = entropy;
1114	bestNumBins = i + 1;
1115	}
1116	}
1117
1118	// Compute cut points
1119	double [] cutPoints = null;
1120	if ((bestNumBins > 1) && (binWidth > 0)) {
1121	cutPoints = new double [bestNumBins - 1];
1122	for(int i = 1; i < bestNumBins; i++) {
1123	cutPoints[i - 1] = min + binWidth * i;
1124	}
1125	}
1126	m_CutPoints[index] = cutPoints;
1127	}
1128
1129	/**
1130	* Set the output format. Takes the currently defined cutpoints and
1131	* m_InputFormat and calls setOutputFormat(Instances) appropriately.
1132	*/
1133	protected void setOutputFormat() {
1134
1135	if (m_CutPoints == null) {
1136	setOutputFormat(null);
1137	return;
1138	}
1139	FastVector attributes = new FastVector(getInputFormat().numAttributes());
1140	int classIndex = getInputFormat().classIndex();
1141	for(int i = 0; i < getInputFormat().numAttributes(); i++) {
1142	if ((m_DiscretizeCols.isInRange(i))
1143	&& (getInputFormat().attribute(i).isNumeric())) {
1144	if (!m_MakeBinary) {
1145	FastVector attribValues = new FastVector(1);
1146	if (m_CutPoints[i] == null) {
1147	attribValues.addElement("'All'");
1148	} else {
1149	for(int j = 0; j <= m_CutPoints[i].length; j++) {
1150	if (j == 0) {
1151	attribValues.addElement("'(-inf-"
1152	+ Utils.doubleToString(m_CutPoints[i][j], 6) + "]'");
1153	} else if (j == m_CutPoints[i].length) {
1154	attribValues.addElement("'("
1155	+ Utils.doubleToString(m_CutPoints[i][j - 1], 6)
1156	+ "-inf)'");
1157	} else {
1158	attribValues.addElement("'("
1159	+ Utils.doubleToString(m_CutPoints[i][j - 1], 6) + "-"
1160	+ Utils.doubleToString(m_CutPoints[i][j], 6) + "]'");
1161	}
1162	}
1163	}
1164	attributes.addElement(new Attribute(getInputFormat().
1165	attribute(i).name(),
1166	attribValues));
1167	} else {
1168	if (m_CutPoints[i] == null) {
1169	FastVector attribValues = new FastVector(1);
1170	attribValues.addElement("'All'");
1171	attributes.addElement(new Attribute(getInputFormat().
1172	attribute(i).name(),
1173	attribValues));
1174	} else {
1175	if (i < getInputFormat().classIndex()) {
1176	classIndex += m_CutPoints[i].length - 1;
1177	}
1178	for(int j = 0; j < m_CutPoints[i].length; j++) {
1179	FastVector attribValues = new FastVector(2);
1180	attribValues.addElement("'(-inf-"
1181	+ Utils.doubleToString(m_CutPoints[i][j], 6) + "]'");
1182	attribValues.addElement("'("
1183	+ Utils.doubleToString(m_CutPoints[i][j], 6) + "-inf)'");
1184	attributes.addElement(new Attribute(getInputFormat().
1185	attribute(i).name(),
1186	attribValues));
1187	}
1188	}
1189	}
1190	} else {
1191	attributes.addElement(getInputFormat().attribute(i).copy());
1192	}
1193	}
1194	Instances outputFormat =
1195	new Instances(getInputFormat().relationName(), attributes, 0);
1196	outputFormat.setClassIndex(classIndex);
1197	setOutputFormat(outputFormat);
1198	}
1199
1200	/**
1201	* Convert a single instance over. The converted instance is added to
1202	* the end of the output queue.
1203	*
1204	* @param instance the instance to convert
1205	*/
1206	protected void convertInstance(Instance instance) {
1207
1208	int index = 0;
1209	double [] vals = new double [outputFormatPeek().numAttributes()];
1210	// Copy and convert the values
1211	for(int i = 0; i < getInputFormat().numAttributes(); i++) {
1212	if (m_DiscretizeCols.isInRange(i) &&
1213	getInputFormat().attribute(i).isNumeric()) {
1214	int j;
1215	double currentVal = instance.value(i);
1216	if (m_CutPoints[i] == null) {
1217	if (instance.isMissing(i)) {
1218	vals[index] = Instance.missingValue();
1219	} else {
1220	vals[index] = 0;
1221	}
1222	index++;
1223	} else {
1224	if (!m_MakeBinary) {
1225	if (instance.isMissing(i)) {
1226	vals[index] = Instance.missingValue();
1227	} else {
1228	for (j = 0; j < m_CutPoints[i].length; j++) {
1229	if (currentVal <= m_CutPoints[i][j]) {
1230	break;
1231	}
1232	}
1233	vals[index] = j;
1234	}
1235	index++;
1236	} else {
1237	for (j = 0; j < m_CutPoints[i].length; j++) {
1238	if (instance.isMissing(i)) {
1239	vals[index] = Instance.missingValue();
1240	} else if (currentVal <= m_CutPoints[i][j]) {
1241	vals[index] = 0;
1242	} else {
1243	vals[index] = 1;
1244	}
1245	index++;
1246	}
1247	}
1248	}
1249	} else {
1250	vals[index] = instance.value(i);
1251	index++;
1252	}
1253	}
1254
1255	Instance inst = null;
1256	if (instance instanceof SparseInstance) {
1257	inst = new SparseInstance(instance.weight(), vals);
1258	} else {
1259	inst = new Instance(instance.weight(), vals);
1260	}
1261	copyStringValues(inst, false, instance.dataset(), getInputStringIndex(),
1262	getOutputFormat(), getOutputStringIndex());
1263	inst.setDataset(getOutputFormat());
1264	push(inst);
1265	}
1266
1267	/**
1268	* Main method for testing this class.
1269	*
1270	* @param argv should contain arguments to the filter: use -h for help
1271	*/
1272	public static void main(String [] argv) {
1273
1274	try {
1275	if (Utils.getFlag('b', argv)) {
1276	Filter.batchFilterFile(new DiscretizeFilter(), argv);
1277	} else {
1278	Filter.filterFile(new DiscretizeFilter(), argv);
1279	}
1280	} catch (Exception ex) {
1281	System.out.println(ex.getMessage());
1282	}
1283	}
1284	}
1285
1286
1287
1288
1289
1290
1291
1292

Note: See TracBrowser for help on using the repository browser.

Download in other formats: