source: trunk/gsdl/packages/kea/kea-3.0/weka/filters/DiscretizeFilter.java@ 8815

Last change on this file since 8815 was 8815, checked in by mdewsnip, 19 years ago

Kea 3.0, as downloaded from http://www.nzdl.org/kea but with CSTR_abstracts_test, CSTR_abstracts_train, Chinese_test, and Chinese_train directories removed.

  • Property svn:keywords set to Author Date Id Revision
File size: 35.9 KB
Line 
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 * DiscretizeFilter.java
19 * Copyright (C) 1999 Eibe Frank,Len Trigg
20 *
21 */
22
23
24package weka.filters;
25
26import java.io.*;
27import java.util.*;
28import weka.core.*;
29
30/**
31 * An instance filter that discretizes a range of numeric attributes in
32 * the dataset into nominal attributes. Discretization can be either by
33 * simple binning, or by Fayyad & Irani's MDL method (the default).<p>
34 *
35 * Valid filter-specific options are: <p>
36 *
37 * -B num <br>
38 * Specifies the (maximum) number of bins to divide numeric attributes into.
39 * (default: class-based discretisation).<p>
40 *
41 * -F <br>
42 * Use equal-frequency instead of equal-width discretization if
43 * class-based discretisation is turned off.<p>
44 *
45 * -O <br>
46 * Optimize the number of bins using a leave-one-out estimate of the
47 * entropy (for equal-width binning).<p>
48 *
49 * -R col1,col2-col4,... <br>
50 * Specifies list of columns to Discretize. First
51 * and last are valid indexes. (default: none) <p>
52 *
53 * -V <br>
54 * Invert matching sense.<p>
55 *
56 * -D <br>
57 * Make binary nominal attributes. <p>
58 *
59 * -E <br>
60 * Use better encoding of split point for MDL. <p>
61 *
62 * -K <br>
63 * Use Kononeko's MDL criterion. <p>
64 *
65 * @author Len Trigg ([email protected])
66 * @author Eibe Frank ([email protected])
67 * @version $Revision: 8815 $
68 */
69public class DiscretizeFilter extends Filter
70 implements OptionHandler, WeightedInstancesHandler {
71
72 /** Stores which columns to Discretize */
73 protected Range m_DiscretizeCols = new Range();
74
75 /** The number of bins to divide the attribute into */
76 protected int m_NumBins = 10;
77
78 /** Store the current cutpoints */
79 protected double [][] m_CutPoints = null;
80
81 /** True if discretisation will be done by MDL rather than binning */
82 protected boolean m_UseMDL = true;
83
84 /** Output binary attributes for discretized attributes. */
85 protected boolean m_MakeBinary = false;
86
87 /** Use better encoding of split point for MDL. */
88 protected boolean m_UseBetterEncoding = false;
89
90 /** Use Kononenko's MDL criterion instead of Fayyad et al.'s */
91 protected boolean m_UseKononenko = false;
92
93 /** Find the number of bins using cross-validated entropy. */
94 protected boolean m_FindNumBins = false;
95
96 /** Use equal-frequency binning if unsupervised discretization turned on */
97 protected boolean m_UseEqualFrequency = false;
98
99 /** Constructor - initialises the filter */
100 public DiscretizeFilter() {
101
102 setAttributeIndices("first-last");
103 }
104
105
106 /**
107 * Gets an enumeration describing the available options
108 *
109 * @return an enumeration of all the available options
110 */
111 public Enumeration listOptions() {
112
113 Vector newVector = new Vector(7);
114
115 newVector.addElement(new Option(
116 "\tSpecifies the (maximum) number of bins to divide numeric"
117 + " attributes into.\n"
118 + "\t(default class-based discretization)",
119 "B", 1, "-B <num>"));
120
121 newVector.addElement(new Option(
122 "\tUse equal-frequency instead of equal-width with\n"+
123 "\tunsupervised discretization.",
124 "F", 0, "-F"));
125
126 newVector.addElement(new Option(
127 "\tOptimize number of bins using leave-one-out estimate\n"+
128 "\tof estimated entropy (for equal-width discretization).",
129 "O", 0, "-O"));
130
131 /* If we decide to implement loading and saving cutfiles like
132 * the C Discretizer (which is probably not necessary)
133 newVector.addElement(new Option(
134 "\tSpecify that the cutpoints should be loaded from a file.",
135 "L", 1, "-L <file>"));
136 newVector.addElement(new Option(
137 "\tSpecify that the chosen cutpoints should be saved to a file.",
138 "S", 1, "-S <file>"));
139 */
140
141 newVector.addElement(new Option(
142 "\tSpecifies list of columns to Discretize. First"
143 + " and last are valid indexes.\n"
144 + "\t(default none)",
145 "R", 1, "-R <col1,col2-col4,...>"));
146
147 newVector.addElement(new Option(
148 "\tInvert matching sense of column indexes.",
149 "V", 0, "-V"));
150
151 newVector.addElement(new Option(
152 "\tOutput binary attributes for discretized attributes.",
153 "D", 0, "-D"));
154
155 newVector.addElement(new Option(
156 "\tUse better encoding of split point for MDL.",
157 "E", 0, "-E"));
158
159 newVector.addElement(new Option(
160 "\tUse Kononenko's MDL criterion.",
161 "K", 0, "-K"));
162
163 return newVector.elements();
164 }
165
166
167 /**
168 * Parses the options for this object. Valid options are: <p>
169 *
170 * -B num <br>
171 * Specifies the (maximum) number of bins to divide numeric attributes into.
172 * (default class-based discretisation).<p>
173 *
174 * -F <br>
175 * Use equal-frequency instead of equal-width discretization if
176 * class-based discretisation is turned off.<p>
177 *
178 * -O <br>
179 * Optimize the number of bins using a leave-one-out estimate of the
180 * entropy (for equal-width binning).<p>
181 *
182 * -R col1,col2-col4,... <br>
183 * Specifies list of columns to Discretize. First
184 * and last are valid indexes. (default none) <p>
185 *
186 * -V <br>
187 * Invert matching sense.<p>
188 *
189 * -D <br>
190 * Make binary nominal attributes. <p>
191 *
192 * -E <br>
193 * Use better encoding of split point for MDL. <p>
194 *
195 * -K <br>
196 * Use Kononeko's MDL criterion. <p>
197 *
198 * @param options the list of options as an array of strings
199 * @exception Exception if an option is not supported
200 */
201 public void setOptions(String[] options) throws Exception {
202
203 setMakeBinary(Utils.getFlag('D', options));
204 setUseEqualFrequency(Utils.getFlag('F', options));
205 setUseBetterEncoding(Utils.getFlag('E', options));
206 setUseKononenko(Utils.getFlag('K', options));
207 setFindNumBins(Utils.getFlag('O', options));
208 setInvertSelection(Utils.getFlag('V', options));
209 setUseMDL(true);
210
211 String numBins = Utils.getOption('B', options);
212 if (numBins.length() != 0) {
213 setBins(Integer.parseInt(numBins));
214 setUseMDL(false);
215 } else {
216 setBins(10);
217 }
218
219 String convertList = Utils.getOption('R', options);
220 if (convertList.length() != 0) {
221 setAttributeIndices(convertList);
222 } else {
223 setAttributeIndices("first-last");
224 }
225
226 if (getInputFormat() != null) {
227 setInputFormat(getInputFormat());
228 }
229 }
230 /**
231 * Gets the current settings of the filter.
232 *
233 * @return an array of strings suitable for passing to setOptions
234 */
235 public String [] getOptions() {
236
237 String [] options = new String [12];
238 int current = 0;
239
240 if (getMakeBinary()) {
241 options[current++] = "-D";
242 }
243 if (getUseEqualFrequency()) {
244 options[current++] = "-F";
245 }
246 if (getUseBetterEncoding()) {
247 options[current++] = "-E";
248 }
249 if (getUseKononenko()) {
250 options[current++] = "-K";
251 }
252 if (getFindNumBins()) {
253 options[current++] = "-O";
254 }
255 if (getInvertSelection()) {
256 options[current++] = "-V";
257 }
258 if (!getUseMDL()) {
259 options[current++] = "-B"; options[current++] = "" + getBins();
260 }
261 if (!getAttributeIndices().equals("")) {
262 options[current++] = "-R"; options[current++] = getAttributeIndices();
263 }
264 while (current < options.length) {
265 options[current++] = "";
266 }
267 return options;
268 }
269
270
271 /**
272 * Sets the format of the input instances.
273 *
274 * @param instanceInfo an Instances object containing the input instance
275 * structure (any instances contained in the object are ignored - only the
276 * structure is required).
277 * @return true if the outputFormat may be collected immediately
278 * @exception Exception if the input format can't be set successfully
279 */
280 public boolean setInputFormat(Instances instanceInfo) throws Exception {
281
282 super.setInputFormat(instanceInfo);
283
284 m_DiscretizeCols.setUpper(instanceInfo.numAttributes() - 1);
285 m_CutPoints = null;
286 if (m_UseMDL) {
287 if (instanceInfo.classIndex() < 0) {
288 throw new UnassignedClassException("Cannot use class-based discretization: "
289 + "no class assigned to the dataset");
290 }
291 if (!instanceInfo.classAttribute().isNominal()) {
292 throw new UnsupportedClassTypeException("Supervised discretization not possible:"
293 + " class is not nominal!");
294 }
295 } else {
296 if (getFindNumBins() && getUseEqualFrequency()) {
297 throw new IllegalArgumentException("Bin number optimization in conjunction "+
298 "with equal-frequency binning not implemented.");
299 }
300 }
301
302 // If we implement loading cutfiles, then load
303 //them here and set the output format
304 return false;
305 }
306
307
308
309 /**
310 * Input an instance for filtering. Ordinarily the instance is processed
311 * and made available for output immediately. Some filters require all
312 * instances be read before producing output.
313 *
314 * @param instance the input instance
315 * @return true if the filtered instance may now be
316 * collected with output().
317 * @exception IllegalStateException if no input format has been defined.
318 */
319 public boolean input(Instance instance) {
320
321 if (getInputFormat() == null) {
322 throw new IllegalStateException("No input instance format defined");
323 }
324 if (m_NewBatch) {
325 resetQueue();
326 m_NewBatch = false;
327 }
328
329 if (m_CutPoints != null) {
330 convertInstance(instance);
331 return true;
332 }
333
334 bufferInput(instance);
335 return false;
336 }
337
338
339 /**
340 * Signifies that this batch of input to the filter is finished. If the
341 * filter requires all instances prior to filtering, output() may now
342 * be called to retrieve the filtered instances.
343 *
344 * @return true if there are instances pending output
345 * @exception IllegalStateException if no input structure has been defined
346 */
347 public boolean batchFinished() {
348
349 if (getInputFormat() == null) {
350 throw new IllegalStateException("No input instance format defined");
351 }
352 if (m_CutPoints == null) {
353 calculateCutPoints();
354
355 setOutputFormat();
356
357 // If we implement saving cutfiles, save the cuts here
358
359 // Convert pending input instances
360 for(int i = 0; i < getInputFormat().numInstances(); i++) {
361 convertInstance(getInputFormat().instance(i));
362 }
363 }
364 flushInput();
365
366 m_NewBatch = true;
367 return (numPendingOutput() != 0);
368 }
369
370 /**
371 * Returns a string describing this filter
372 *
373 * @return a description of the filter suitable for
374 * displaying in the explorer/experimenter gui
375 */
376 public String globalInfo() {
377
378 return "An instance filter that discretizes a range of numeric"
379 + " attributes in the dataset into nominal attributes."
380 + " Discretization can be either by simple binning, or by"
381 + " Fayyad & Irani's MDL method (the default).";
382 }
383
384 /**
385 * Returns the tip text for this property
386 *
387 * @return tip text for this property suitable for
388 * displaying in the explorer/experimenter gui
389 */
390 public String findNumBinsTipText() {
391
392 return "Optimize number of equal-width bins using leave-one-out.";
393 }
394
395 /**
396 * Get the value of FindNumBins.
397 *
398 * @return Value of FindNumBins.
399 */
400 public boolean getFindNumBins() {
401
402 return m_FindNumBins;
403 }
404
405 /**
406 * Set the value of FindNumBins.
407 *
408 * @param newFindNumBins Value to assign to FindNumBins.
409 */
410 public void setFindNumBins(boolean newFindNumBins) {
411
412 m_FindNumBins = newFindNumBins;
413 }
414
415 /**
416 * Returns the tip text for this property
417 *
418 * @return tip text for this property suitable for
419 * displaying in the explorer/experimenter gui
420 */
421 public String makeBinaryTipText() {
422
423 return "Make resulting attributes binary.";
424 }
425
426 /**
427 * Gets whether binary attributes should be made for discretized ones.
428 *
429 * @return true if attributes will be binarized
430 */
431 public boolean getMakeBinary() {
432
433 return m_MakeBinary;
434 }
435
436 /**
437 * Sets whether binary attributes should be made for discretized ones.
438 *
439 * @param makeBinary if binary attributes are to be made
440 */
441 public void setMakeBinary(boolean makeBinary) {
442
443 m_MakeBinary = makeBinary;
444 }
445
446 /**
447 * Returns the tip text for this property
448 *
449 * @return tip text for this property suitable for
450 * displaying in the explorer/experimenter gui
451 */
452 public String useMDLTipText() {
453
454 return "Use class-based discretization. If set to false, does"
455 + " not require a class attribute, and uses a fixed number"
456 + " of bins (according to bins setting).";
457 }
458
459 /**
460 * Gets whether MDL will be used as the discretisation method.
461 *
462 * @return true if so, false if fixed bins should be used.
463 */
464 public boolean getUseMDL() {
465
466 return m_UseMDL;
467 }
468
469 /**
470 * Sets whether MDL will be used as the discretisation method.
471 *
472 * @param useMDL true if MDL should be used, false if fixed bins should
473 * be used.
474 */
475 public void setUseMDL(boolean useMDL) {
476
477 m_UseMDL = useMDL;
478 }
479
480 /**
481 * Returns the tip text for this property
482 *
483 * @return tip text for this property suitable for
484 * displaying in the explorer/experimenter gui
485 */
486 public String useKononenkoTipText() {
487
488 return "Use Kononenko's MDL criterion. If set to false"
489 + " uses the Fayyad & Irani criterion.";
490 }
491
492 /**
493 * Get the value of UseEqualFrequency.
494 *
495 * @return Value of UseEqualFrequency.
496 */
497 public boolean getUseEqualFrequency() {
498
499 return m_UseEqualFrequency;
500 }
501
502 /**
503 * Set the value of UseEqualFrequency.
504 *
505 * @param newUseEqualFrequency Value to assign to UseEqualFrequency.
506 */
507 public void setUseEqualFrequency(boolean newUseEqualFrequency) {
508
509 m_UseEqualFrequency = newUseEqualFrequency;
510 }
511
512 /**
513 * Gets whether Kononenko's MDL criterion is to be used.
514 *
515 * @return true if Kononenko's criterion will be used.
516 */
517 public boolean getUseKononenko() {
518
519 return m_UseKononenko;
520 }
521
522 /**
523 * Sets whether Kononenko's MDL criterion is to be used.
524 *
525 * @param useKon true if Kononenko's one is to be used
526 */
527 public void setUseKononenko(boolean useKon) {
528
529 m_UseKononenko = useKon;
530 }
531
532 /**
533 * Returns the tip text for this property
534 *
535 * @return tip text for this property suitable for
536 * displaying in the explorer/experimenter gui
537 */
538 public String useBetterEncodingTipText() {
539
540 return "Uses a more efficient split point encoding.";
541 }
542
543 /**
544 * Gets whether better encoding is to be used for MDL.
545 *
546 * @return true if the better MDL encoding will be used
547 */
548 public boolean getUseBetterEncoding() {
549
550 return m_UseBetterEncoding;
551 }
552
553 /**
554 * Sets whether better encoding is to be used for MDL.
555 *
556 * @param useBetterEncoding true if better encoding to be used.
557 */
558 public void setUseBetterEncoding(boolean useBetterEncoding) {
559
560 m_UseBetterEncoding = useBetterEncoding;
561 }
562
563 /**
564 * Returns the tip text for this property
565 *
566 * @return tip text for this property suitable for
567 * displaying in the explorer/experimenter gui
568 */
569 public String binsTipText() {
570
571 return "Number of bins for class-blind discretisation. This"
572 + " setting is ignored if MDL-based discretisation is used.";
573 }
574
575 /**
576 * Gets the number of bins numeric attributes will be divided into
577 *
578 * @return the number of bins.
579 */
580 public int getBins() {
581
582 return m_NumBins;
583 }
584
585 /**
586 * Sets the number of bins to divide each selected numeric attribute into
587 *
588 * @param numBins the number of bins
589 */
590 public void setBins(int numBins) {
591
592 m_NumBins = numBins;
593 }
594
595 /**
596 * Returns the tip text for this property
597 *
598 * @return tip text for this property suitable for
599 * displaying in the explorer/experimenter gui
600 */
601 public String invertSelectionTipText() {
602
603 return "Set attribute selection mode. If false, only selected"
604 + " (numeric) attributes in the range will be discretized; if"
605 + " true, only non-selected attributes will be discretized.";
606 }
607
608 /**
609 * Gets whether the supplied columns are to be removed or kept
610 *
611 * @return true if the supplied columns will be kept
612 */
613 public boolean getInvertSelection() {
614
615 return m_DiscretizeCols.getInvert();
616 }
617
618 /**
619 * Sets whether selected columns should be removed or kept. If true the
620 * selected columns are kept and unselected columns are deleted. If false
621 * selected columns are deleted and unselected columns are kept.
622 *
623 * @param invert the new invert setting
624 */
625 public void setInvertSelection(boolean invert) {
626
627 m_DiscretizeCols.setInvert(invert);
628 }
629
630 /**
631 * Returns the tip text for this property
632 *
633 * @return tip text for this property suitable for
634 * displaying in the explorer/experimenter gui
635 */
636 public String attributeIndicesTipText() {
637 return "Specify range of attributes to act on."
638 + " This is a comma separated list of attribute indices, with"
639 + " \"first\" and \"last\" valid values. Specify an inclusive"
640 + " range with \"-\". E.g: \"first-3,5,6-10,last\".";
641 }
642
643 /**
644 * Gets the current range selection
645 *
646 * @return a string containing a comma separated list of ranges
647 */
648 public String getAttributeIndices() {
649
650 return m_DiscretizeCols.getRanges();
651 }
652
653 /**
654 * Sets which attributes are to be Discretized (only numeric
655 * attributes among the selection will be Discretized).
656 *
657 * @param rangeList a string representing the list of attributes. Since
658 * the string will typically come from a user, attributes are indexed from
659 * 1. <br>
660 * eg: first-3,5,6-last
661 * @exception IllegalArgumentException if an invalid range list is supplied
662 */
663 public void setAttributeIndices(String rangeList) {
664
665 m_DiscretizeCols.setRanges(rangeList);
666 }
667
668 /**
669 * Sets which attributes are to be Discretized (only numeric
670 * attributes among the selection will be Discretized).
671 *
672 * @param attributes an array containing indexes of attributes to Discretize.
673 * Since the array will typically come from a program, attributes are indexed
674 * from 0.
675 * @exception IllegalArgumentException if an invalid set of ranges
676 * is supplied
677 */
678 public void setAttributeIndicesArray(int [] attributes) {
679
680 setAttributeIndices(Range.indicesToRangeList(attributes));
681 }
682
683 /**
684 * Gets the cut points for an attribute
685 *
686 * @param the index (from 0) of the attribute to get the cut points of
687 * @return an array containing the cutpoints (or null if the
688 * attribute requested isn't being Discretized
689 */
690 public double [] getCutPoints(int attributeIndex) {
691
692 if (m_CutPoints == null) {
693 return null;
694 }
695 return m_CutPoints[attributeIndex];
696 }
697
698 /** Generate the cutpoints for each attribute */
699 protected void calculateCutPoints() {
700
701 Instances copy = null;
702
703 m_CutPoints = new double [getInputFormat().numAttributes()] [];
704 for(int i = getInputFormat().numAttributes() - 1; i >= 0; i--) {
705 if ((m_DiscretizeCols.isInRange(i)) &&
706 (getInputFormat().attribute(i).isNumeric())) {
707 if (m_UseMDL) {
708
709 // Use copy to preserve order
710 if (copy == null) {
711 copy = new Instances(getInputFormat());
712 }
713 calculateCutPointsByMDL(i, copy);
714 } else {
715 if (m_FindNumBins) {
716 findNumBins(i);
717 } else if (!m_UseEqualFrequency) {
718 calculateCutPointsByEqualWidthBinning(i);
719 } else {
720 calculateCutPointsByEqualFrequencyBinning(i);
721 }
722 }
723 }
724 }
725 }
726
727 /**
728 * Set cutpoints for a single attribute using MDL.
729 *
730 * @param index the index of the attribute to set cutpoints for
731 */
732 protected void calculateCutPointsByMDL(int index,
733 Instances data) {
734
735 // Sort instances
736 data.sort(data.attribute(index));
737
738 // Find first instances that's missing
739 int firstMissing = data.numInstances();
740 for (int i = 0; i < data.numInstances(); i++) {
741 if (data.instance(i).isMissing(index)) {
742 firstMissing = i;
743 break;
744 }
745 }
746 m_CutPoints[index] = cutPointsForSubset(data, index, 0, firstMissing);
747 }
748
749 /** Test using Kononenko's MDL criterion. */
750 private boolean KononenkosMDL(double[] priorCounts,
751 double[][] bestCounts,
752 double numInstances,
753 int numCutPoints) {
754
755 double distPrior, instPrior, distAfter = 0, sum, instAfter = 0;
756 double before, after;
757 int numClassesTotal;
758
759 // Number of classes occuring in the set
760 numClassesTotal = 0;
761 for (int i = 0; i < priorCounts.length; i++) {
762 if (priorCounts[i] > 0) {
763 numClassesTotal++;
764 }
765 }
766
767 // Encode distribution prior to split
768 distPrior = SpecialFunctions.log2Binomial(numInstances
769 + numClassesTotal - 1,
770 numClassesTotal - 1);
771
772 // Encode instances prior to split.
773 instPrior = SpecialFunctions.log2Multinomial(numInstances,
774 priorCounts);
775
776 before = instPrior + distPrior;
777
778 // Encode distributions and instances after split.
779 for (int i = 0; i < bestCounts.length; i++) {
780 sum = Utils.sum(bestCounts[i]);
781 distAfter += SpecialFunctions.log2Binomial(sum + numClassesTotal - 1,
782 numClassesTotal - 1);
783 instAfter += SpecialFunctions.log2Multinomial(sum,
784 bestCounts[i]);
785 }
786
787 // Coding cost after split
788 after = Utils.log2(numCutPoints) + distAfter + instAfter;
789
790 // Check if split is to be accepted
791 return (Utils.gr(before, after));
792 }
793
794
795 /** Test using Fayyad and Irani's MDL criterion. */
796 private boolean FayyadAndIranisMDL(double[] priorCounts,
797 double[][] bestCounts,
798 double numInstances,
799 int numCutPoints) {
800
801 double priorEntropy, entropy, gain;
802 double entropyLeft, entropyRight, delta;
803 int numClassesTotal, numClassesRight, numClassesLeft;
804
805 // Compute entropy before split.
806 priorEntropy = ContingencyTables.entropy(priorCounts);
807
808 // Compute entropy after split.
809 entropy = ContingencyTables.entropyConditionedOnRows(bestCounts);
810
811 // Compute information gain.
812 gain = priorEntropy - entropy;
813
814 // Number of classes occuring in the set
815 numClassesTotal = 0;
816 for (int i = 0; i < priorCounts.length; i++) {
817 if (priorCounts[i] > 0) {
818 numClassesTotal++;
819 }
820 }
821
822 // Number of classes occuring in the left subset
823 numClassesLeft = 0;
824 for (int i = 0; i < bestCounts[0].length; i++) {
825 if (bestCounts[0][i] > 0) {
826 numClassesLeft++;
827 }
828 }
829
830 // Number of classes occuring in the right subset
831 numClassesRight = 0;
832 for (int i = 0; i < bestCounts[1].length; i++) {
833 if (bestCounts[1][i] > 0) {
834 numClassesRight++;
835 }
836 }
837
838 // Entropy of the left and the right subsets
839 entropyLeft = ContingencyTables.entropy(bestCounts[0]);
840 entropyRight = ContingencyTables.entropy(bestCounts[1]);
841
842 // Compute terms for MDL formula
843 delta = Utils.log2(Math.pow(3, numClassesTotal) - 2) -
844 (((double) numClassesTotal * priorEntropy) -
845 (numClassesRight * entropyRight) -
846 (numClassesLeft * entropyLeft));
847
848 // Check if split is to be accepted
849 return (Utils.gr(gain, (Utils.log2(numCutPoints) + delta) /
850 (double)numInstances));
851 }
852
853
854 /** Selects cutpoints for sorted subset. */
855 private double[] cutPointsForSubset(Instances instances, int attIndex,
856 int first, int lastPlusOne) {
857
858 double[][] counts, bestCounts;
859 double[] priorCounts, left, right, cutPoints;
860 double currentCutPoint = -Double.MAX_VALUE, bestCutPoint = -1,
861 currentEntropy, bestEntropy, priorEntropy, gain;
862 int bestIndex = -1, numInstances = 0, numCutPoints = 0;
863
864 // Compute number of instances in set
865 if ((lastPlusOne - first) < 2) {
866 return null;
867 }
868
869 // Compute class counts.
870 counts = new double[2][instances.numClasses()];
871 for (int i = first; i < lastPlusOne; i++) {
872 numInstances += instances.instance(i).weight();
873 counts[1][(int)instances.instance(i).classValue()] +=
874 instances.instance(i).weight();
875 }
876
877 // Save prior counts
878 priorCounts = new double[instances.numClasses()];
879 System.arraycopy(counts[1], 0, priorCounts, 0,
880 instances.numClasses());
881
882 // Entropy of the full set
883 priorEntropy = ContingencyTables.entropy(priorCounts);
884 bestEntropy = priorEntropy;
885
886 // Find best entropy.
887 bestCounts = new double[2][instances.numClasses()];
888 for (int i = first; i < (lastPlusOne - 1); i++) {
889 counts[0][(int)instances.instance(i).classValue()] +=
890 instances.instance(i).weight();
891 counts[1][(int)instances.instance(i).classValue()] -=
892 instances.instance(i).weight();
893 if (Utils.sm(instances.instance(i).value(attIndex),
894 instances.instance(i + 1).value(attIndex))) {
895 currentCutPoint = instances.instance(i).value(attIndex); //+
896 //instances.instance(i + 1).value(attIndex)) / 2.0;
897 currentEntropy = ContingencyTables.entropyConditionedOnRows(counts);
898 if (Utils.sm(currentEntropy, bestEntropy)) {
899 bestCutPoint = currentCutPoint;
900 bestEntropy = currentEntropy;
901 bestIndex = i;
902 System.arraycopy(counts[0], 0,
903 bestCounts[0], 0, instances.numClasses());
904 System.arraycopy(counts[1], 0,
905 bestCounts[1], 0, instances.numClasses());
906 }
907 numCutPoints++;
908 }
909 }
910
911 // Use worse encoding?
912 if (!m_UseBetterEncoding) {
913 numCutPoints = (lastPlusOne - first) - 1;
914 }
915
916 // Checks if gain is zero
917 gain = priorEntropy - bestEntropy;
918 if (Utils.eq(gain, 0)) {
919 return null;
920 }
921
922 // Check if split is to be accepted
923 if ((m_UseKononenko && KononenkosMDL(priorCounts, bestCounts,
924 numInstances, numCutPoints)) ||
925 (!m_UseKononenko && FayyadAndIranisMDL(priorCounts, bestCounts,
926 numInstances, numCutPoints))) {
927
928 // Select split points for the left and right subsets
929 left = cutPointsForSubset(instances, attIndex, first, bestIndex + 1);
930 right = cutPointsForSubset(instances, attIndex,
931 bestIndex + 1, lastPlusOne);
932
933 // Merge cutpoints and return them
934 if ((left == null) && (right) == null) {
935 cutPoints = new double[1];
936 cutPoints[0] = bestCutPoint;
937 } else if (right == null) {
938 cutPoints = new double[left.length + 1];
939 System.arraycopy(left, 0, cutPoints, 0, left.length);
940 cutPoints[left.length] = bestCutPoint;
941 } else if (left == null) {
942 cutPoints = new double[1 + right.length];
943 cutPoints[0] = bestCutPoint;
944 System.arraycopy(right, 0, cutPoints, 1, right.length);
945 } else {
946 cutPoints = new double[left.length + right.length + 1];
947 cutPoints = new double[left.length + right.length + 1];
948 System.arraycopy(left, 0, cutPoints, 0, left.length);
949 cutPoints[left.length] = bestCutPoint;
950 System.arraycopy(right, 0, cutPoints, left.length + 1, right.length);
951 }
952
953 return cutPoints;
954 } else
955 return null;
956 }
957
958 /**
959 * Set cutpoints for a single attribute.
960 *
961 * @param index the index of the attribute to set cutpoints for
962 */
963 protected void calculateCutPointsByEqualWidthBinning(int index) {
964
965 // Scan for max and min values
966 double max = 0, min = 1, currentVal;
967 Instance currentInstance;
968 for(int i = 0; i < getInputFormat().numInstances(); i++) {
969 currentInstance = getInputFormat().instance(i);
970 if (!currentInstance.isMissing(index)) {
971 currentVal = currentInstance.value(index);
972 if (max < min) {
973 max = min = currentVal;
974 }
975 if (currentVal > max) {
976 max = currentVal;
977 }
978 if (currentVal < min) {
979 min = currentVal;
980 }
981 }
982 }
983 double binWidth = (max - min) / m_NumBins;
984 double [] cutPoints = null;
985 if ((m_NumBins > 1) && (binWidth > 0)) {
986 cutPoints = new double [m_NumBins - 1];
987 for(int i = 1; i < m_NumBins; i++) {
988 cutPoints[i - 1] = min + binWidth * i;
989 }
990 }
991 m_CutPoints[index] = cutPoints;
992 }
993
994 /**
995 * Set cutpoints for a single attribute.
996 *
997 * @param index the index of the attribute to set cutpoints for
998 */
999 protected void calculateCutPointsByEqualFrequencyBinning(int index) {
1000
1001 // Copy data so that it can be sorted
1002 Instances data = new Instances(getInputFormat());
1003
1004 // Sort input data
1005 data.sort(index);
1006
1007 // Compute weight of instances without missing values
1008 double sumOfWeights = 0;
1009 for (int i = 0; i < data.numInstances(); i++) {
1010 if (data.instance(i).isMissing(index)) {
1011 break;
1012 } else {
1013 sumOfWeights += data.instance(i).weight();
1014 }
1015 }
1016 double freq = sumOfWeights / m_NumBins;
1017
1018 // Compute break points
1019 double[] cutPoints = new double[m_NumBins - 1];
1020 double counter = 0;
1021 int cpindex = 0;
1022 for (int i = 0; i < data.numInstances() - 1; i++) {
1023
1024 // Stop if value missing
1025 if (data.instance(i).isMissing(index)) {
1026 break;
1027 }
1028 counter += data.instance(i).weight();
1029
1030 // Do we have a potential breakpoint?
1031 if (data.instance(i).value(index) <
1032 data.instance(i + 1).value(index)) {
1033 if (counter >= freq) {
1034 cutPoints[cpindex] = (data.instance(i).value(index) +
1035 data.instance(i + 1).value(index)) / 2;
1036 cpindex++;
1037 counter = counter - freq;
1038 }
1039 }
1040 }
1041
1042 // Did we find any cutpoints?
1043 if (cpindex == 0) {
1044 m_CutPoints[index] = null;
1045 } else {
1046 double[] cp = new double[cpindex];
1047 for (int i = 0; i < cpindex; i++) {
1048 cp[i] = cutPoints[i];
1049 }
1050 m_CutPoints[index] = cp;
1051 }
1052 }
1053
1054 /**
1055 * Optimizes the number of bins using leave-one-out cross-validation.
1056 *
1057 * @param index the attribute index
1058 */
1059 protected void findNumBins(int index) {
1060
1061 double min = Double.MAX_VALUE, max = -Double.MIN_VALUE, binWidth = 0,
1062 entropy, bestEntropy = Double.MAX_VALUE, currentVal;
1063 double[] distribution;
1064 int bestNumBins = 1;
1065 Instance currentInstance;
1066
1067 // Find minimum and maximum
1068 for (int i = 0; i < getInputFormat().numInstances(); i++) {
1069 currentInstance = getInputFormat().instance(i);
1070 if (!currentInstance.isMissing(index)) {
1071 currentVal = currentInstance.value(index);
1072 if (currentVal > max) {
1073 max = currentVal;
1074 }
1075 if (currentVal < min) {
1076 min = currentVal;
1077 }
1078 }
1079 }
1080
1081 // Find best number of bins
1082 for (int i = 0; i < m_NumBins; i++) {
1083 distribution = new double[i + 1];
1084 binWidth = (max - min) / (i + 1);
1085
1086 // Compute distribution
1087 for (int j = 0; j < getInputFormat().numInstances(); j++) {
1088 currentInstance = getInputFormat().instance(j);
1089 if (!currentInstance.isMissing(index)) {
1090 for (int k = 0; k < i + 1; k++) {
1091 if (currentInstance.value(index) <=
1092 (min + (((double)k + 1) * binWidth))) {
1093 distribution[k] += currentInstance.weight();
1094 break;
1095 }
1096 }
1097 }
1098 }
1099
1100 // Compute cross-validated entropy
1101 entropy = 0;
1102 for (int k = 0; k < i + 1; k++) {
1103 if (distribution[k] < 2) {
1104 entropy = Double.MAX_VALUE;
1105 break;
1106 }
1107 entropy -= distribution[k] * Math.log((distribution[k] - 1) /
1108 binWidth);
1109 }
1110
1111 // Best entropy so far?
1112 if (entropy < bestEntropy) {
1113 bestEntropy = entropy;
1114 bestNumBins = i + 1;
1115 }
1116 }
1117
1118 // Compute cut points
1119 double [] cutPoints = null;
1120 if ((bestNumBins > 1) && (binWidth > 0)) {
1121 cutPoints = new double [bestNumBins - 1];
1122 for(int i = 1; i < bestNumBins; i++) {
1123 cutPoints[i - 1] = min + binWidth * i;
1124 }
1125 }
1126 m_CutPoints[index] = cutPoints;
1127 }
1128
1129 /**
1130 * Set the output format. Takes the currently defined cutpoints and
1131 * m_InputFormat and calls setOutputFormat(Instances) appropriately.
1132 */
1133 protected void setOutputFormat() {
1134
1135 if (m_CutPoints == null) {
1136 setOutputFormat(null);
1137 return;
1138 }
1139 FastVector attributes = new FastVector(getInputFormat().numAttributes());
1140 int classIndex = getInputFormat().classIndex();
1141 for(int i = 0; i < getInputFormat().numAttributes(); i++) {
1142 if ((m_DiscretizeCols.isInRange(i))
1143 && (getInputFormat().attribute(i).isNumeric())) {
1144 if (!m_MakeBinary) {
1145 FastVector attribValues = new FastVector(1);
1146 if (m_CutPoints[i] == null) {
1147 attribValues.addElement("'All'");
1148 } else {
1149 for(int j = 0; j <= m_CutPoints[i].length; j++) {
1150 if (j == 0) {
1151 attribValues.addElement("'(-inf-"
1152 + Utils.doubleToString(m_CutPoints[i][j], 6) + "]'");
1153 } else if (j == m_CutPoints[i].length) {
1154 attribValues.addElement("'("
1155 + Utils.doubleToString(m_CutPoints[i][j - 1], 6)
1156 + "-inf)'");
1157 } else {
1158 attribValues.addElement("'("
1159 + Utils.doubleToString(m_CutPoints[i][j - 1], 6) + "-"
1160 + Utils.doubleToString(m_CutPoints[i][j], 6) + "]'");
1161 }
1162 }
1163 }
1164 attributes.addElement(new Attribute(getInputFormat().
1165 attribute(i).name(),
1166 attribValues));
1167 } else {
1168 if (m_CutPoints[i] == null) {
1169 FastVector attribValues = new FastVector(1);
1170 attribValues.addElement("'All'");
1171 attributes.addElement(new Attribute(getInputFormat().
1172 attribute(i).name(),
1173 attribValues));
1174 } else {
1175 if (i < getInputFormat().classIndex()) {
1176 classIndex += m_CutPoints[i].length - 1;
1177 }
1178 for(int j = 0; j < m_CutPoints[i].length; j++) {
1179 FastVector attribValues = new FastVector(2);
1180 attribValues.addElement("'(-inf-"
1181 + Utils.doubleToString(m_CutPoints[i][j], 6) + "]'");
1182 attribValues.addElement("'("
1183 + Utils.doubleToString(m_CutPoints[i][j], 6) + "-inf)'");
1184 attributes.addElement(new Attribute(getInputFormat().
1185 attribute(i).name(),
1186 attribValues));
1187 }
1188 }
1189 }
1190 } else {
1191 attributes.addElement(getInputFormat().attribute(i).copy());
1192 }
1193 }
1194 Instances outputFormat =
1195 new Instances(getInputFormat().relationName(), attributes, 0);
1196 outputFormat.setClassIndex(classIndex);
1197 setOutputFormat(outputFormat);
1198 }
1199
1200 /**
1201 * Convert a single instance over. The converted instance is added to
1202 * the end of the output queue.
1203 *
1204 * @param instance the instance to convert
1205 */
1206 protected void convertInstance(Instance instance) {
1207
1208 int index = 0;
1209 double [] vals = new double [outputFormatPeek().numAttributes()];
1210 // Copy and convert the values
1211 for(int i = 0; i < getInputFormat().numAttributes(); i++) {
1212 if (m_DiscretizeCols.isInRange(i) &&
1213 getInputFormat().attribute(i).isNumeric()) {
1214 int j;
1215 double currentVal = instance.value(i);
1216 if (m_CutPoints[i] == null) {
1217 if (instance.isMissing(i)) {
1218 vals[index] = Instance.missingValue();
1219 } else {
1220 vals[index] = 0;
1221 }
1222 index++;
1223 } else {
1224 if (!m_MakeBinary) {
1225 if (instance.isMissing(i)) {
1226 vals[index] = Instance.missingValue();
1227 } else {
1228 for (j = 0; j < m_CutPoints[i].length; j++) {
1229 if (currentVal <= m_CutPoints[i][j]) {
1230 break;
1231 }
1232 }
1233 vals[index] = j;
1234 }
1235 index++;
1236 } else {
1237 for (j = 0; j < m_CutPoints[i].length; j++) {
1238 if (instance.isMissing(i)) {
1239 vals[index] = Instance.missingValue();
1240 } else if (currentVal <= m_CutPoints[i][j]) {
1241 vals[index] = 0;
1242 } else {
1243 vals[index] = 1;
1244 }
1245 index++;
1246 }
1247 }
1248 }
1249 } else {
1250 vals[index] = instance.value(i);
1251 index++;
1252 }
1253 }
1254
1255 Instance inst = null;
1256 if (instance instanceof SparseInstance) {
1257 inst = new SparseInstance(instance.weight(), vals);
1258 } else {
1259 inst = new Instance(instance.weight(), vals);
1260 }
1261 copyStringValues(inst, false, instance.dataset(), getInputStringIndex(),
1262 getOutputFormat(), getOutputStringIndex());
1263 inst.setDataset(getOutputFormat());
1264 push(inst);
1265 }
1266
1267 /**
1268 * Main method for testing this class.
1269 *
1270 * @param argv should contain arguments to the filter: use -h for help
1271 */
1272 public static void main(String [] argv) {
1273
1274 try {
1275 if (Utils.getFlag('b', argv)) {
1276 Filter.batchFilterFile(new DiscretizeFilter(), argv);
1277 } else {
1278 Filter.filterFile(new DiscretizeFilter(), argv);
1279 }
1280 } catch (Exception ex) {
1281 System.out.println(ex.getMessage());
1282 }
1283 }
1284}
1285
1286
1287
1288
1289
1290
1291
1292
Note: See TracBrowser for help on using the repository browser.