source: trunk/gsdl/packages/kea/kea-3.0/weka/core/Instances.java@ 8815

Last change on this file since 8815 was 8815, checked in by mdewsnip, 19 years ago

Kea 3.0, as downloaded from http://www.nzdl.org/kea but with CSTR_abstracts_test, CSTR_abstracts_train, Chinese_test, and Chinese_train directories removed.

  • Property svn:keywords set to Author Date Id Revision
File size: 63.2 KB
Line 
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 * Instances.java
19 * Copyright (C) 1999 Eibe Frank
20 *
21 */
22
23package weka.core;
24
25import java.io.*;
26import java.util.*;
27
28/**
29 * Class for handling an ordered set of weighted instances. <p>
30 *
31 * Typical usage (code from the main() method of this class): <p>
32 *
33 * <code>
34 * ... <br>
35 *
36 * // Read all the instances in the file <br>
37 * reader = new FileReader(filename); <br>
38 * instances = new Instances(reader); <br><br>
39 *
40 * // Make the last attribute be the class <br>
41 * instances.setClassIndex(instances.numAttributes() - 1); <br><br>
42 *
43 * // Print header and instances. <br>
44 * System.out.println("\nDataset:\n"); <br>
45 * System.out.println(instances); <br><br>
46 *
47 * ... <br>
48 * </code><p>
49 *
50 * All methods that change a set of instances are safe, i.e. a change
51 * of a set of instances does not affect any other sets of
52 * instances. All methods that change a dataset's attribute
53 * information clone the dataset before it is changed.
54 *
55 * @author Eibe Frank ([email protected])
56 * @author Len Trigg ([email protected])
57 * @version $Revision: 8815 $
58 */
59public class Instances implements Serializable {
60
61 /** The filename extension that should be used for arff files */
62 public static String FILE_EXTENSION = ".arff";
63
64 /** The dataset's name. */
65 protected String m_RelationName;
66
67 /** The attribute information. */
68 protected FastVector m_Attributes;
69
70 /** The instances. */
71 protected FastVector m_Instances;
72
73 /** The class attribute's index */
74 protected int m_ClassIndex;
75
76 /** Buffer of values for sparse instance */
77 protected double[] m_ValueBuffer;
78
79 /** Buffer of indices for sparse instance */
80 protected int[] m_IndicesBuffer;
81
82 /**
83 * Reads an ARFF file from a reader, and assigns a weight of
84 * one to each instance. Lets the index of the class
85 * attribute be undefined (negative).
86 *
87 * @param reader the reader
88 * @exception IOException if the ARFF file is not read
89 * successfully
90 */
91 public Instances(Reader reader) throws IOException {
92
93 StreamTokenizer tokenizer;
94
95 tokenizer = new StreamTokenizer(reader);
96 initTokenizer(tokenizer);
97 readHeader(tokenizer);
98 m_ClassIndex = -1;
99 m_Instances = new FastVector(1000);
100 while (getInstance(tokenizer, true)) {};
101 compactify();
102 }
103
104 /**
105 * Reads the header of an ARFF file from a reader and
106 * reserves space for the given number of instances. Lets
107 * the class index be undefined (negative).
108 *
109 * @param reader the reader
110 * @param capacity the capacity
111 * @exception IllegalArgumentException if the header is not read successfully
112 * or the capacity is negative.
113 * @exception IOException if there is a problem with the reader.
114 */
115 public Instances(Reader reader, int capacity) throws IOException {
116
117 StreamTokenizer tokenizer;
118
119 if (capacity < 0) {
120 throw new IllegalArgumentException("Capacity has to be positive!");
121 }
122 tokenizer = new StreamTokenizer(reader);
123 initTokenizer(tokenizer);
124 readHeader(tokenizer);
125 m_ClassIndex = -1;
126 m_Instances = new FastVector(capacity);
127 }
128
129 /**
130 * Constructor copying all instances and references to
131 * the header information from the given set of instances.
132 *
133 * @param instances the set to be copied
134 */
135 public Instances(Instances dataset) {
136
137 this(dataset, dataset.numInstances());
138
139 dataset.copyInstances(0, this, dataset.numInstances());
140 }
141
/**
 * Creates an empty dataset that shares the header information
 * (class index, relation name, attribute vector) of the given
 * dataset by reference. A negative capacity is treated as 0.
 *
 * @param dataset the instances from which the header
 * information is to be taken
 * @param capacity the capacity of the new dataset
 */
public Instances(Instances dataset, int capacity) {

  // References are shared rather than cloned; strings are immutable,
  // so a shallow copy of the name is sufficient.
  m_ClassIndex = dataset.m_ClassIndex;
  m_RelationName = dataset.m_RelationName;
  m_Attributes = dataset.m_Attributes;
  m_Instances = new FastVector(Math.max(capacity, 0));
}
164
165 /**
166 * Creates a new set of instances by copying a
167 * subset of another set.
168 *
169 * @param source the set of instances from which a subset
170 * is to be created
171 * @param first the index of the first instance to be copied
172 * @param toCopy the number of instances to be copied
173 * @exception IllegalArgumentException if first and toCopy are out of range
174 */
175 public Instances(Instances source, int first, int toCopy) {
176
177 this(source, toCopy);
178
179 if ((first < 0) || ((first + toCopy) > source.numInstances())) {
180 throw new IllegalArgumentException("Parameters first and/or toCopy out "+
181 "of range");
182 }
183 source.copyInstances(first, this, toCopy);
184 }
185
/**
 * Creates an empty set of instances. Uses the given
 * attribute information. Sets the capacity of the set of
 * instances to 0 if it is negative. Given attribute information
 * must not be changed after this constructor has been used.
 * Each attribute in attInfo has its index set to its position
 * in the vector.
 *
 * @param name the name of the relation
 * @param attInfo the attribute information
 * @param capacity the capacity of the set; negative values are
 * treated as 0
 */
public Instances(String name, FastVector attInfo, int capacity) {

  m_RelationName = name;
  m_ClassIndex = -1;
  m_Attributes = attInfo;
  for (int i = 0; i < numAttributes(); i++) {
    attribute(i).setIndex(i);
  }
  // Clamp as documented (and as the (Instances, int) constructor does);
  // previously a negative capacity was handed to FastVector unchanged.
  m_Instances = new FastVector(Math.max(capacity, 0));
}
206
207 /**
208 * Create a copy of the structure, but "cleanse" string types (i.e.
209 * doesn't contain references to the strings seen in the past).
210 *
211 * @return a copy of the instance structure.
212 */
213 public Instances stringFreeStructure() {
214
215 FastVector atts = (FastVector)m_Attributes.copy();
216 for (int i = 0 ; i < atts.size(); i++) {
217 Attribute att = (Attribute)atts.elementAt(i);
218 if (att.type() == Attribute.STRING) {
219 atts.setElementAt(new Attribute(att.name(), null), i);
220 }
221 }
222 Instances result = new Instances(relationName(), atts, 0);
223 result.m_ClassIndex = m_ClassIndex;
224 return result;
225 }
226
227 /**
228 * Adds one instance to the end of the set.
229 * Shallow copies instance before it is added. Increases the
230 * size of the dataset if it is not large enough. Does not
231 * check if the instance is compatible with the dataset.
232 *
233 * @param instance the instance to be added
234 */
235 public final void add(Instance instance) {
236
237 Instance newInstance = (Instance)instance.copy();
238
239 newInstance.setDataset(this);
240 m_Instances.addElement(newInstance);
241 }
242
243 /**
244 * Returns an attribute.
245 *
246 * @param index the attribute's index
247 * @return the attribute at the given position
248 */
249 public final Attribute attribute(int index) {
250
251 return (Attribute) m_Attributes.elementAt(index);
252 }
253
254 /**
255 * Returns an attribute given its name. If there is more than
256 * one attribute with the same name, it returns the first one.
257 * Returns null if the attribute can't be found.
258 *
259 * @param name the attribute's name
260 * @return the attribute with the given name, null if the
261 * attribute can't be found
262 */
263 public final Attribute attribute(String name) {
264
265 for (int i = 0; i < numAttributes(); i++) {
266 if (attribute(i).name().equals(name)) {
267 return attribute(i);
268 }
269 }
270 return null;
271 }
272
273 /**
274 * Checks for string attributes in the dataset
275 *
276 * @return true if string attributes are present, false otherwise
277 */
278 public boolean checkForStringAttributes() {
279
280 int i = 0;
281
282 while (i < m_Attributes.size()) {
283 if (attribute(i++).isString()) {
284 return true;
285 }
286 }
287 return false;
288 }
289
290 /**
291 * Checks if the given instance is compatible
292 * with this dataset. Only looks at the size of
293 * the instance and the ranges of the values for
294 * nominal and string attributes.
295 *
296 * @return true if the instance is compatible with the dataset
297 */
298 public final boolean checkInstance(Instance instance) {
299
300 if (instance.numAttributes() != numAttributes()) {
301 return false;
302 }
303 for (int i = 0; i < numAttributes(); i++) {
304 if (instance.isMissing(i)) {
305 continue;
306 } else if (attribute(i).isNominal() ||
307 attribute(i).isString()) {
308 if (!(Utils.eq(instance.value(i),
309 (double)(int)instance.value(i)))) {
310 return false;
311 } else if (Utils.sm(instance.value(i), 0) ||
312 Utils.gr(instance.value(i),
313 attribute(i).numValues())) {
314 return false;
315 }
316 }
317 }
318 return true;
319 }
320
321 /**
322 * Returns the class attribute.
323 *
324 * @return the class attribute
325 * @exception UnassignedClassException if the class is not set
326 */
327 public final Attribute classAttribute() {
328
329 if (m_ClassIndex < 0) {
330 throw new UnassignedClassException("Class index is negative (not set)!");
331 }
332 return attribute(m_ClassIndex);
333 }
334
335 /**
336 * Returns the class attribute's index. Returns negative number
337 * if it's undefined.
338 *
339 * @return the class index as an integer
340 */
341 public final int classIndex() {
342
343 return m_ClassIndex;
344 }
345
346 /**
347 * Compactifies the set of instances. Decreases the capacity of
348 * the set so that it matches the number of instances in the set.
349 */
350 public final void compactify() {
351
352 m_Instances.trimToSize();
353 }
354
355 /**
356 * Removes all instances from the set.
357 */
358 public final void delete() {
359
360 m_Instances = new FastVector();
361 }
362
363 /**
364 * Removes an instance at the given position from the set.
365 *
366 * @param index the instance's position
367 */
368 public final void delete(int index) {
369
370 m_Instances.removeElementAt(index);
371 }
372
373 /**
374 * Deletes an attribute at the given position
375 * (0 to numAttributes() - 1). A deep copy of the attribute
376 * information is performed before the attribute is deleted.
377 *
378 * @param pos the attribute's position
379 * @exception IllegalArgumentException if the given index is out of range or the
380 * class attribute is being deleted
381 */
382 public void deleteAttributeAt(int position) {
383
384 if ((position < 0) || (position >= m_Attributes.size())) {
385 throw new IllegalArgumentException("Index out of range");
386 }
387 if (position == m_ClassIndex) {
388 throw new IllegalArgumentException("Can't delete class attribute");
389 }
390 freshAttributeInfo();
391 if (m_ClassIndex > position) {
392 m_ClassIndex--;
393 }
394 m_Attributes.removeElementAt(position);
395 for (int i = position; i < m_Attributes.size(); i++) {
396 Attribute current = (Attribute)m_Attributes.elementAt(i);
397 current.setIndex(current.index() - 1);
398 }
399 for (int i = 0; i < numInstances(); i++) {
400 instance(i).forceDeleteAttributeAt(position);
401 }
402 }
403
404 /**
405 * Deletes all string attributes in the dataset. A deep copy of the attribute
406 * information is performed before an attribute is deleted.
407 *
408 * @exception IllegalArgumentException if string attribute couldn't be
409 * successfully deleted (probably because it is the class attribute).
410 */
411 public void deleteStringAttributes() {
412
413 int i = 0;
414 while (i < m_Attributes.size()) {
415 if (attribute(i).isString()) {
416 deleteAttributeAt(i);
417 } else {
418 i++;
419 }
420 }
421 }
422
423 /**
424 * Removes all instances with missing values for a particular
425 * attribute from the dataset.
426 *
427 * @param attIndex the attribute's index
428 */
429 public final void deleteWithMissing(int attIndex) {
430
431 FastVector newInstances = new FastVector(numInstances());
432
433 for (int i = 0; i < numInstances(); i++) {
434 if (!instance(i).isMissing(attIndex)) {
435 newInstances.addElement(instance(i));
436 }
437 }
438 m_Instances = newInstances;
439 }
440
441 /**
442 * Removes all instances with missing values for a particular
443 * attribute from the dataset.
444 *
445 * @param att the attribute
446 */
447 public final void deleteWithMissing(Attribute att) {
448
449 deleteWithMissing(att.index());
450 }
451
452 /**
453 * Removes all instances with a missing class value
454 * from the dataset.
455 *
456 * @exception UnassignedClassException if class is not set
457 */
458 public final void deleteWithMissingClass() {
459
460 if (m_ClassIndex < 0) {
461 throw new UnassignedClassException("Class index is negative (not set)!");
462 }
463 deleteWithMissing(m_ClassIndex);
464 }
465
466 /**
467 * Returns an enumeration of all the attributes.
468 *
469 * @return enumeration of all the attributes.
470 */
471 public Enumeration enumerateAttributes() {
472
473 return m_Attributes.elements(m_ClassIndex);
474 }
475
476 /**
477 * Returns an enumeration of all instances in the dataset.
478 *
479 * @return enumeration of all instances in the dataset
480 */
481 public final Enumeration enumerateInstances() {
482
483 return m_Instances.elements();
484 }
485
486 /**
487 * Checks if two headers are equivalent.
488 *
489 * @param dataset another dataset
490 * @return true if the header of the given dataset is equivalent
491 * to this header
492 */
493 public final boolean equalHeaders(Instances dataset){
494
495 // Check class and all attributes
496 if (m_ClassIndex != dataset.m_ClassIndex) {
497 return false;
498 }
499 if (m_Attributes.size() != dataset.m_Attributes.size()) {
500 return false;
501 }
502 for (int i = 0; i < m_Attributes.size(); i++) {
503 if (!(attribute(i).equals(dataset.attribute(i)))) {
504 return false;
505 }
506 }
507 return true;
508 }
509
510 /**
511 * Returns the first instance in the set.
512 *
513 * @return the first instance in the set
514 */
515 public final Instance firstInstance() {
516
517 return (Instance)m_Instances.firstElement();
518 }
519
520 /**
521 * Inserts an attribute at the given position (0 to
522 * numAttributes()) and sets all values to be missing.
523 * Shallow copies the attribute before it is inserted, and performs
524 * a deep copy of the existing attribute information.
525 *
526 * @param att the attribute to be inserted
527 * @param pos the attribute's position
528 * @exception IllegalArgumentException if the given index is out of range
529 */
530 public void insertAttributeAt(Attribute att, int position) {
531
532 if ((position < 0) ||
533 (position > m_Attributes.size())) {
534 throw new IllegalArgumentException("Index out of range");
535 }
536 att = (Attribute)att.copy();
537 freshAttributeInfo();
538 att.setIndex(position);
539 m_Attributes.insertElementAt(att, position);
540 for (int i = position + 1; i < m_Attributes.size(); i++) {
541 Attribute current = (Attribute)m_Attributes.elementAt(i);
542 current.setIndex(current.index() + 1);
543 }
544 for (int i = 0; i < numInstances(); i++) {
545 instance(i).forceInsertAttributeAt(position);
546 }
547 if (m_ClassIndex >= position) {
548 m_ClassIndex++;
549 }
550 }
551
552 /**
553 * Returns the instance at the given position.
554 *
555 * @param index the instance's index
556 * @return the instance at the given position
557 */
558 public final Instance instance(int index) {
559
560 return (Instance)m_Instances.elementAt(index);
561 }
562
563 /**
564 * Returns the last instance in the set.
565 *
566 * @return the last instance in the set
567 */
568 public final Instance lastInstance() {
569
570 return (Instance)m_Instances.lastElement();
571 }
572
573 /**
574 * Returns the mean (mode) for a numeric (nominal) attribute as
575 * a floating-point value. Returns 0 if the attribute is neither nominal nor
576 * numeric. If all values are missing it returns zero.
577 *
578 * @param attIndex the attribute's index
579 * @return the mean or the mode
580 */
581 public final double meanOrMode(int attIndex) {
582
583 double result, found;
584 int [] counts;
585
586 if (attribute(attIndex).isNumeric()) {
587 result = found = 0;
588 for (int j = 0; j < numInstances(); j++) {
589 if (!instance(j).isMissing(attIndex)) {
590 found += instance(j).weight();
591 result += instance(j).weight()*instance(j).value(attIndex);
592 }
593 }
594 if (Utils.eq(found, 0)) {
595 return 0;
596 } else {
597 return result / found;
598 }
599 } else if (attribute(attIndex).isNominal()) {
600 counts = new int[attribute(attIndex).numValues()];
601 for (int j = 0; j < numInstances(); j++) {
602 if (!instance(j).isMissing(attIndex)) {
603 counts[(int) instance(j).value(attIndex)] += instance(j).weight();
604 }
605 }
606 return (double)Utils.maxIndex(counts);
607 } else {
608 return 0;
609 }
610 }
611
612 /**
613 * Returns the mean (mode) for a numeric (nominal) attribute as a
614 * floating-point value. Returns 0 if the attribute is neither
615 * nominal nor numeric. If all values are missing it returns zero.
616 *
617 * @param att the attribute
618 * @return the mean or the mode
619 */
620 public final double meanOrMode(Attribute att) {
621
622 return meanOrMode(att.index());
623 }
624
625 /**
626 * Returns the number of attributes.
627 *
628 * @return the number of attributes as an integer
629 */
630 public final int numAttributes() {
631
632 return m_Attributes.size();
633 }
634
635 /**
636 * Returns the number of class labels.
637 *
638 * @return the number of class labels as an integer if the class
639 * attribute is nominal, 1 otherwise.
640 * @exception UnassignedClassException if the class is not set
641 */
642 public final int numClasses() {
643
644 if (m_ClassIndex < 0) {
645 throw new UnassignedClassException("Class index is negative (not set)!");
646 }
647 if (!classAttribute().isNominal()) {
648 return 1;
649 } else {
650 return classAttribute().numValues();
651 }
652 }
653
654 /**
655 * Returns the number of distinct values of a given attribute.
656 * Returns the number of instances if the attribute is a
657 * string attribute. The value 'missing' is not counted.
658 *
659 * @param attIndex the attribute
660 * @return the number of distinct values of a given attribute
661 */
662 public final int numDistinctValues(int attIndex) {
663
664 if (attribute(attIndex).isNumeric()) {
665 double [] attVals = attributeToDoubleArray(attIndex);
666 int [] sorted = Utils.sort(attVals);
667 double prev = 0;
668 int counter = 0;
669 for (int i = 0; i < sorted.length; i++) {
670 Instance current = instance(sorted[i]);
671 if (current.isMissing(attIndex)) {
672 break;
673 }
674 if ((i == 0) ||
675 Utils.gr(current.value(attIndex), prev)) {
676 prev = current.value(attIndex);
677 counter++;
678 }
679 }
680 return counter;
681 } else {
682 return attribute(attIndex).numValues();
683 }
684 }
685
686 /**
687 * Returns the number of distinct values of a given attribute.
688 * Returns the number of instances if the attribute is a
689 * string attribute. The value 'missing' is not counted.
690 *
691 * @param att the attribute
692 * @return the number of distinct values of a given attribute
693 */
694 public final int numDistinctValues(Attribute att) {
695
696 return numDistinctValues(att.index());
697 }
698
699 /**
700 * Returns the number of instances in the dataset.
701 *
702 * @return the number of instances in the dataset as an integer
703 */
704 public final int numInstances() {
705
706 return m_Instances.size();
707 }
708
709 /**
710 * Shuffles the instances in the set so that they are ordered
711 * randomly.
712 *
713 * @param random a random number generator
714 */
715 public final void randomize(Random random) {
716
717 for (int j = numInstances() - 1; j > 0; j--)
718 swap(j,(int)(random.nextDouble()*(double)j));
719 }
720
721 /**
722 * Reads a single instance from the reader and appends it
723 * to the dataset. Automatically expands the dataset if it
724 * is not large enough to hold the instance. This method does
725 * not check for carriage return at the end of the line.
726 *
727 * @param reader the reader
728 * @return false if end of file has been reached
729 * @exception IOException if the information is not read
730 * successfully
731 */
732 public final boolean readInstance(Reader reader)
733 throws IOException {
734
735 StreamTokenizer tokenizer = new StreamTokenizer(reader);
736
737 initTokenizer(tokenizer);
738 return getInstance(tokenizer, false);
739 }
740
741 /**
742 * Returns the relation's name.
743 *
744 * @return the relation's name as a string
745 */
746 public final String relationName() {
747
748 return m_RelationName;
749 }
750
751 /**
752 * Renames an attribute. This change only affects this
753 * dataset.
754 *
755 * @param att the attribute's index
756 * @param name the new name
757 */
758 public final void renameAttribute(int att, String name) {
759
760 Attribute newAtt = attribute(att).copy(name);
761 FastVector newVec = new FastVector(numAttributes());
762
763 for (int i = 0; i < numAttributes(); i++) {
764 if (i == att) {
765 newVec.addElement(newAtt);
766 } else {
767 newVec.addElement(attribute(i));
768 }
769 }
770 m_Attributes = newVec;
771 }
772
773 /**
774 * Renames an attribute. This change only affects this
775 * dataset.
776 *
777 * @param att the attribute
778 * @param name the new name
779 */
780 public final void renameAttribute(Attribute att, String name) {
781
782 renameAttribute(att.index(), name);
783 }
784
785 /**
786 * Renames the value of a nominal (or string) attribute value. This
787 * change only affects this dataset.
788 *
789 * @param att the attribute's index
790 * @param val the value's index
791 * @param name the new name
792 */
793 public final void renameAttributeValue(int att, int val, String name) {
794
795 Attribute newAtt = (Attribute)attribute(att).copy();
796 FastVector newVec = new FastVector(numAttributes());
797
798 newAtt.setValue(val, name);
799 for (int i = 0; i < numAttributes(); i++) {
800 if (i == att) {
801 newVec.addElement(newAtt);
802 } else {
803 newVec.addElement(attribute(i));
804 }
805 }
806 m_Attributes = newVec;
807 }
808
809 /**
810 * Renames the value of a nominal (or string) attribute value. This
811 * change only affects this dataset.
812 *
813 * @param att the attribute
814 * @param val the value
815 * @param name the new name
816 */
817 public final void renameAttributeValue(Attribute att, String val,
818 String name) {
819
820 renameAttributeValue(att.index(), att.indexOfValue(val), name);
821 }
822
823 /**
824 * Creates a new dataset of the same size using random sampling
825 * with replacement.
826 *
827 * @param random a random number generator
828 * @return the new dataset
829 */
830 public final Instances resample(Random random) {
831
832 Instances newData = new Instances(this, numInstances());
833 while (newData.numInstances() < numInstances()) {
834 int i = (int) (random.nextDouble() * (double) numInstances());
835 newData.add(instance(i));
836 }
837 return newData;
838 }
839
840 /**
841 * Creates a new dataset of the same size using random sampling
842 * with replacement according to the current instance weights. The
843 * weights of the instances in the new dataset are set to one.
844 *
845 * @param random a random number generator
846 * @return the new dataset
847 */
848 public final Instances resampleWithWeights(Random random) {
849
850 double [] weights = new double[numInstances()];
851 boolean foundOne = false;
852 for (int i = 0; i < weights.length; i++) {
853 weights[i] = instance(i).weight();
854 if (!Utils.eq(weights[i], weights[0])) {
855 foundOne = true;
856 }
857 }
858 if (foundOne) {
859 return resampleWithWeights(random, weights);
860 } else {
861 return new Instances(this);
862 }
863 }
864
865
866 /**
867 * Creates a new dataset of the same size using random sampling
868 * with replacement according to the given weight vector. The
869 * weights of the instances in the new dataset are set to one.
870 * The length of the weight vector has to be the same as the
871 * number of instances in the dataset, and all weights have to
872 * be positive.
873 *
874 * @param random a random number generator
875 * @param weights the weight vector
876 * @return the new dataset
877 * @exception IllegalArgumentException if the weights array is of the wrong
878 * length or contains negative weights.
879 */
880 public final Instances resampleWithWeights(Random random,
881 double[] weights) {
882
883 if (weights.length != numInstances()) {
884 throw new IllegalArgumentException("weights.length != numInstances.");
885 }
886 Instances newData = new Instances(this, numInstances());
887 double[] probabilities = new double[numInstances()];
888 double sumProbs = 0, sumOfWeights = Utils.sum(weights);
889 for (int i = 0; i < numInstances(); i++) {
890 sumProbs += random.nextDouble();
891 probabilities[i] = sumProbs;
892 }
893 Utils.normalize(probabilities, sumProbs / sumOfWeights);
894
895 // Make sure that rounding errors don't mess things up
896 probabilities[numInstances() - 1] = sumOfWeights;
897 int k = 0; int l = 0;
898 sumProbs = 0;
899 while ((k < numInstances() && (l < numInstances()))) {
900 if (weights[l] < 0) {
901 throw new IllegalArgumentException("Weights have to be positive.");
902 }
903 sumProbs += weights[l];
904 while ((k < numInstances()) &&
905 (probabilities[k] <= sumProbs)) {
906 newData.add(instance(l));
907 newData.instance(k).setWeight(1);
908 k++;
909 }
910 l++;
911 }
912 return newData;
913 }
914
915 /**
916 * Sets the class attribute.
917 *
918 * @param att attribute to be the class
919 */
920 public final void setClass(Attribute att) {
921
922 m_ClassIndex = att.index();
923 }
924
925 /**
926 * Sets the class index of the set.
927 * If the class index is negative there is assumed to be no class.
928 * (ie. it is undefined)
929 *
930 * @param classIndex the new class index
931 * @exception IllegalArgumentException if the class index is too big or < 0
932 */
933 public final void setClassIndex(int classIndex) {
934
935 if (classIndex >= numAttributes()) {
936 throw new IllegalArgumentException("Invalid class index: " + classIndex);
937 }
938 m_ClassIndex = classIndex;
939 }
940
941 /**
942 * Sets the relation's name.
943 *
944 * @param newName the new relation name.
945 */
946 public final void setRelationName(String newName) {
947
948 m_RelationName = newName;
949 }
950
951 /**
952 * Sorts the instances based on an attribute. For numeric attributes,
953 * instances are sorted in ascending order. For nominal attributes,
954 * instances are sorted based on the attribute label ordering
955 * specified in the header. Instances with missing values for the
956 * attribute are placed at the end of the dataset.
957 *
958 * @param attIndex the attribute's index
959 */
960 public final void sort(int attIndex) {
961
962 int i,j;
963
964 // move all instances with missing values to end
965 j = numInstances() - 1;
966 i = 0;
967 while (i <= j) {
968 if (instance(j).isMissing(attIndex)) {
969 j--;
970 } else {
971 if (instance(i).isMissing(attIndex)) {
972 swap(i,j);
973 j--;
974 }
975 i++;
976 }
977 }
978 quickSort(attIndex, 0, j);
979 }
980
981 /**
982 * Sorts the instances based on an attribute. For numeric attributes,
983 * instances are sorted into ascending order. For nominal attributes,
984 * instances are sorted based on the attribute label ordering
985 * specified in the header. Instances with missing values for the
986 * attribute are placed at the end of the dataset.
987 *
988 * @param att the attribute
989 */
990 public final void sort(Attribute att) {
991
992 sort(att.index());
993 }
994
995 /**
996 * Stratifies a set of instances according to its class values
997 * if the class attribute is nominal (so that afterwards a
998 * stratified cross-validation can be performed).
999 *
1000 * @param numFolds the number of folds in the cross-validation
1001 * @exception UnassignedClassException if the class is not set
1002 */
1003 public final void stratify(int numFolds) {
1004
1005 if (numFolds <= 0) {
1006 throw new IllegalArgumentException("Number of folds must be greater than 1");
1007 }
1008 if (m_ClassIndex < 0) {
1009 throw new UnassignedClassException("Class index is negative (not set)!");
1010 }
1011 if (classAttribute().isNominal()) {
1012
1013 // sort by class
1014 int index = 1;
1015 while (index < numInstances()) {
1016 Instance instance1 = instance(index - 1);
1017 for (int j = index; j < numInstances(); j++) {
1018 Instance instance2 = instance(j);
1019 if ((instance1.classValue() == instance2.classValue()) ||
1020 (instance1.classIsMissing() &&
1021 instance2.classIsMissing())) {
1022 swap(index,j);
1023 index++;
1024 }
1025 }
1026 index++;
1027 }
1028 stratStep(numFolds);
1029 }
1030 }
1031
1032 /**
1033 * Computes the sum of all the instances' weights.
1034 *
1035 * @return the sum of all the instances' weights as a double
1036 */
1037 public final double sumOfWeights() {
1038
1039 double sum = 0;
1040
1041 for (int i = 0; i < numInstances(); i++) {
1042 sum += instance(i).weight();
1043 }
1044 return sum;
1045 }
1046
1047 /**
1048 * Creates the test set for one fold of a cross-validation on
1049 * the dataset.
1050 *
1051 * @param numFolds the number of folds in the cross-validation. Must
1052 * be greater than 1.
1053 * @param numFold 0 for the first fold, 1 for the second, ...
1054 * @return the test set as a set of weighted instances
1055 * @exception IllegalArgumentException if the number of folds is less than 2
1056 * or greater than the number of instances.
1057 */
1058 public Instances testCV(int numFolds, int numFold) {
1059
1060 int numInstForFold, first, offset;
1061 Instances test;
1062
1063 if (numFolds < 2) {
1064 throw new IllegalArgumentException("Number of folds must be at least 2!");
1065 }
1066 if (numFolds > numInstances()) {
1067 throw new IllegalArgumentException("Can't have more folds than instances!");
1068 }
1069 numInstForFold = numInstances() / numFolds;
1070 if (numFold < numInstances() % numFolds){
1071 numInstForFold++;
1072 offset = numFold;
1073 }else
1074 offset = numInstances() % numFolds;
1075 test = new Instances(this, numInstForFold);
1076 first = numFold * (numInstances() / numFolds) + offset;
1077 copyInstances(first, test, numInstForFold);
1078 return test;
1079 }
1080
1081 /**
1082 * Returns the dataset as a string in ARFF format. Strings
1083 * are quoted if they contain whitespace characters, or if they
1084 * are a question mark.
1085 *
1086 * @return the dataset in ARFF format as a string
1087 */
1088 public final String toString() {
1089
1090 StringBuffer text = new StringBuffer();
1091
1092 text.append("@relation " + Utils.quote(m_RelationName) + "\n\n");
1093 for (int i = 0; i < numAttributes(); i++) {
1094 text.append(attribute(i) + "\n");
1095 }
1096 text.append("\n@data\n");
1097 for (int i = 0; i < numInstances(); i++) {
1098 text.append(instance(i));
1099 if (i < numInstances() - 1) {
1100 text.append('\n');
1101 }
1102 }
1103 return text.toString();
1104 }
1105
1106 /**
1107 * Creates the training set for one fold of a cross-validation
1108 * on the dataset.
1109 *
1110 * @param numFolds the number of folds in the cross-validation. Must
1111 * be greater than 1.
1112 * @param numFold 0 for the first fold, 1 for the second, ...
1113 * @return the training set as a set of weighted
1114 * instances
1115 * @exception IllegalArgumentException if the number of folds is less than 2
1116 * or greater than the number of instances.
1117 */
1118 public Instances trainCV(int numFolds, int numFold) {
1119
1120 int numInstForFold, first, offset;
1121 Instances train;
1122
1123 if (numFolds < 2) {
1124 throw new IllegalArgumentException("Number of folds must be at least 2!");
1125 }
1126 if (numFolds > numInstances()) {
1127 throw new IllegalArgumentException("Can't have more folds than instances!");
1128 }
1129 numInstForFold = numInstances() / numFolds;
1130 if (numFold < numInstances() % numFolds) {
1131 numInstForFold++;
1132 offset = numFold;
1133 }else
1134 offset = numInstances() % numFolds;
1135 train = new Instances(this, numInstances() - numInstForFold);
1136 first = numFold * (numInstances() / numFolds) + offset;
1137 copyInstances(0, train, first);
1138 copyInstances(first + numInstForFold, train,
1139 numInstances() - first - numInstForFold);
1140
1141 return train;
1142 }
1143
1144 /**
1145 * Computes the variance for a numeric attribute.
1146 *
1147 * @param attIndex the numeric attribute
1148 * @return the variance if the attribute is numeric
1149 * @exception IllegalArgumentException if the attribute is not numeric
1150 */
1151 public final double variance(int attIndex) {
1152
1153 double sum = 0, sumSquared = 0, sumOfWeights = 0;
1154
1155 if (!attribute(attIndex).isNumeric()) {
1156 throw new IllegalArgumentException("Can't compute variance because attribute is " +
1157 "not numeric!");
1158 }
1159 for (int i = 0; i < numInstances(); i++) {
1160 if (!instance(i).isMissing(attIndex)) {
1161 sum += instance(i).weight() *
1162 instance(i).value(attIndex);
1163 sumSquared += instance(i).weight() *
1164 instance(i).value(attIndex) *
1165 instance(i).value(attIndex);
1166 sumOfWeights += instance(i).weight();
1167 }
1168 }
1169 if (Utils.smOrEq(sumOfWeights, 1)) {
1170 return 0;
1171 }
1172 return (sumSquared - (sum * sum / sumOfWeights)) /
1173 (sumOfWeights - 1);
1174 }
1175
1176 /**
1177 * Computes the variance for a numeric attribute.
1178 *
1179 * @param att the numeric attribute
1180 * @return the variance if the attribute is numeric
1181 * @exception IllegalArgumentException if the attribute is not numeric
1182 */
1183 public final double variance(Attribute att) {
1184
1185 return variance(att.index());
1186 }
1187
1188 /**
1189 * Calculates summary statistics on the values that appear in this
1190 * set of instances for a specified attribute.
1191 *
1192 * @param index the index of the attribute to summarize.
1193 * @return an AttributeStats object with it's fields calculated.
1194 */
1195 public AttributeStats attributeStats(int index) {
1196
1197 AttributeStats result = new AttributeStats();
1198 if (attribute(index).isNominal()) {
1199 result.nominalCounts = new int [attribute(index).numValues()];
1200 }
1201 if (attribute(index).isNumeric()) {
1202 result.numericStats = new weka.experiment.Stats();
1203 }
1204 result.totalCount = numInstances();
1205
1206 double [] attVals = attributeToDoubleArray(index);
1207 int [] sorted = Utils.sort(attVals);
1208 int currentCount = 0;
1209 double prev = Instance.missingValue();
1210 for (int j = 0; j < numInstances(); j++) {
1211 Instance current = instance(sorted[j]);
1212 if (current.isMissing(index)) {
1213 result.missingCount = numInstances() - j;
1214 break;
1215 }
1216 if (Utils.eq(current.value(index), prev)) {
1217 currentCount++;
1218 } else {
1219 result.addDistinct(prev, currentCount);
1220 currentCount = 1;
1221 prev = current.value(index);
1222 }
1223 }
1224 result.addDistinct(prev, currentCount);
1225 result.distinctCount--; // So we don't count "missing" as a value
1226 return result;
1227 }
1228
1229 /**
1230 * Gets the value of all instances in this dataset for a particular
1231 * attribute. Useful in conjunction with Utils.sort to allow iterating
1232 * through the dataset in sorted order for some attribute.
1233 *
1234 * @param index the index of the attribute.
1235 * @return an array containing the value of the desired attribute for
1236 * each instance in the dataset.
1237 */
1238 public double [] attributeToDoubleArray(int index) {
1239
1240 double [] result = new double[numInstances()];
1241 for (int i = 0; i < result.length; i++) {
1242 result[i] = instance(i).value(index);
1243 }
1244 return result;
1245 }
1246
1247 /**
1248 * Generates a string summarizing the set of instances. Gives a breakdown
1249 * for each attribute indicating the number of missing/discrete/unique
1250 * values and other information.
1251 *
1252 * @return a string summarizing the dataset
1253 */
1254 public String toSummaryString() {
1255
1256 StringBuffer result = new StringBuffer();
1257 result.append("Relation Name: ").append(relationName()).append('\n');
1258 result.append("Num Instances: ").append(numInstances()).append('\n');
1259 result.append("Num Attributes: ").append(numAttributes()).append('\n');
1260 result.append('\n');
1261
1262 result.append(Utils.padLeft("", 5)).append(Utils.padRight("Name", 25));
1263 result.append(Utils.padLeft("Type", 5)).append(Utils.padLeft("Nom", 5));
1264 result.append(Utils.padLeft("Int", 5)).append(Utils.padLeft("Real", 5));
1265 result.append(Utils.padLeft("Missing", 12));
1266 result.append(Utils.padLeft("Unique", 12));
1267 result.append(Utils.padLeft("Dist", 6)).append('\n');
1268 for (int i = 0; i < numAttributes(); i++) {
1269 Attribute a = attribute(i);
1270 AttributeStats as = attributeStats(i);
1271 result.append(Utils.padLeft("" + (i + 1), 4)).append(' ');
1272 result.append(Utils.padRight(a.name(), 25)).append(' ');
1273 long percent;
1274 switch (a.type()) {
1275 case Attribute.NOMINAL:
1276 result.append(Utils.padLeft("Nom", 4)).append(' ');
1277 percent = Math.round(100.0 * as.intCount / as.totalCount);
1278 result.append(Utils.padLeft("" + percent, 3)).append("% ");
1279 result.append(Utils.padLeft("" + 0, 3)).append("% ");
1280 percent = Math.round(100.0 * as.realCount / as.totalCount);
1281 result.append(Utils.padLeft("" + percent, 3)).append("% ");
1282 break;
1283 case Attribute.NUMERIC:
1284 result.append(Utils.padLeft("Num", 4)).append(' ');
1285 result.append(Utils.padLeft("" + 0, 3)).append("% ");
1286 percent = Math.round(100.0 * as.intCount / as.totalCount);
1287 result.append(Utils.padLeft("" + percent, 3)).append("% ");
1288 percent = Math.round(100.0 * as.realCount / as.totalCount);
1289 result.append(Utils.padLeft("" + percent, 3)).append("% ");
1290 break;
1291 case Attribute.STRING:
1292 result.append(Utils.padLeft("Str", 4)).append(' ');
1293 percent = Math.round(100.0 * as.intCount / as.totalCount);
1294 result.append(Utils.padLeft("" + percent, 3)).append("% ");
1295 result.append(Utils.padLeft("" + 0, 3)).append("% ");
1296 percent = Math.round(100.0 * as.realCount / as.totalCount);
1297 result.append(Utils.padLeft("" + percent, 3)).append("% ");
1298 break;
1299 default:
1300 result.append(Utils.padLeft("???", 4)).append(' ');
1301 result.append(Utils.padLeft("" + 0, 3)).append("% ");
1302 percent = Math.round(100.0 * as.intCount / as.totalCount);
1303 result.append(Utils.padLeft("" + percent, 3)).append("% ");
1304 percent = Math.round(100.0 * as.realCount / as.totalCount);
1305 result.append(Utils.padLeft("" + percent, 3)).append("% ");
1306 break;
1307 }
1308 result.append(Utils.padLeft("" + as.missingCount, 5)).append(" /");
1309 percent = Math.round(100.0 * as.missingCount / as.totalCount);
1310 result.append(Utils.padLeft("" + percent, 3)).append("% ");
1311 result.append(Utils.padLeft("" + as.uniqueCount, 5)).append(" /");
1312 percent = Math.round(100.0 * as.uniqueCount / as.totalCount);
1313 result.append(Utils.padLeft("" + percent, 3)).append("% ");
1314 result.append(Utils.padLeft("" + as.distinctCount, 5)).append(' ');
1315 result.append('\n');
1316 }
1317 return result.toString();
1318 }
1319
1320 /**
1321 * Reads a single instance using the tokenizer and appends it
1322 * to the dataset. Automatically expands the dataset if it
1323 * is not large enough to hold the instance.
1324 *
1325 * @param tokenizer the tokenizer to be used
1326 * @param flag if method should test for carriage return after
1327 * each instance
1328 * @return false if end of file has been reached
1329 * @exception IOException if the information is not read
1330 * successfully
1331 */
1332 protected boolean getInstance(StreamTokenizer tokenizer,
1333 boolean flag)
1334 throws IOException {
1335
1336 // Check if any attributes have been declared.
1337 if (m_Attributes.size() == 0) {
1338 errms(tokenizer,"no header information available");
1339 }
1340
1341 // Check if end of file reached.
1342 getFirstToken(tokenizer);
1343 if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
1344 return false;
1345 }
1346
1347 // Parse instance
1348 if (tokenizer.ttype == '{') {
1349 return getInstanceSparse(tokenizer, flag);
1350 } else {
1351 return getInstanceFull(tokenizer, flag);
1352 }
1353 }
1354
1355 /**
1356 * Reads a single instance using the tokenizer and appends it
1357 * to the dataset. Automatically expands the dataset if it
1358 * is not large enough to hold the instance.
1359 *
1360 * @param tokenizer the tokenizer to be used
1361 * @param flag if method should test for carriage return after
1362 * each instance
1363 * @return false if end of file has been reached
1364 * @exception IOException if the information is not read
1365 * successfully
1366 */
1367 protected boolean getInstanceSparse(StreamTokenizer tokenizer,
1368 boolean flag)
1369 throws IOException {
1370
1371 int valIndex, numValues = 0, maxIndex = -1;
1372
1373 // Get values
1374 do {
1375
1376 // Get index
1377 getIndex(tokenizer);
1378 if (tokenizer.ttype == '}') {
1379 break;
1380 }
1381
1382 // Is index valid?
1383 try{
1384 m_IndicesBuffer[numValues] = Integer.valueOf(tokenizer.sval).intValue();
1385 } catch (NumberFormatException e) {
1386 errms(tokenizer,"index number expected");
1387 }
1388 if (m_IndicesBuffer[numValues] <= maxIndex) {
1389 errms(tokenizer,"indices have to be ordered");
1390 }
1391 if ((m_IndicesBuffer[numValues] < 0) ||
1392 (m_IndicesBuffer[numValues] >= numAttributes())) {
1393 errms(tokenizer,"index out of bounds");
1394 }
1395 maxIndex = m_IndicesBuffer[numValues];
1396
1397 // Get value;
1398 getNextToken(tokenizer);
1399
1400 // Check if value is missing.
1401 if (tokenizer.ttype == '?') {
1402 m_ValueBuffer[numValues] = Instance.missingValue();
1403 } else {
1404
1405 // Check if token is valid.
1406 if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
1407 errms(tokenizer,"not a valid value");
1408 }
1409 if (attribute(m_IndicesBuffer[numValues]).isNominal()) {
1410
1411 // Check if value appears in header.
1412 valIndex =
1413 attribute(m_IndicesBuffer[numValues]).indexOfValue(tokenizer.sval);
1414 if (valIndex == -1) {
1415 errms(tokenizer,"nominal value not declared in header");
1416 }
1417 m_ValueBuffer[numValues] = (double)valIndex;
1418 } else if (attribute(m_IndicesBuffer[numValues]).isNumeric()) {
1419
1420 // Check if value is really a number.
1421 try{
1422 m_ValueBuffer[numValues] = Double.valueOf(tokenizer.sval).
1423 doubleValue();
1424 } catch (NumberFormatException e) {
1425 errms(tokenizer,"number expected");
1426 }
1427 } else {
1428 m_ValueBuffer[numValues] =
1429 attribute(m_IndicesBuffer[numValues]).addStringValue(tokenizer.sval);
1430 }
1431 }
1432 numValues++;
1433 } while (true);
1434 if (flag) {
1435 getLastToken(tokenizer,true);
1436 }
1437
1438 // Add instance to dataset
1439 double[] tempValues = new double[numValues];
1440 int[] tempIndices = new int[numValues];
1441 System.arraycopy(m_ValueBuffer, 0, tempValues, 0, numValues);
1442 System.arraycopy(m_IndicesBuffer, 0, tempIndices, 0, numValues);
1443 add(new SparseInstance(1, tempValues, tempIndices, numAttributes()));
1444 return true;
1445 }
1446
1447 /**
1448 * Reads a single instance using the tokenizer and appends it
1449 * to the dataset. Automatically expands the dataset if it
1450 * is not large enough to hold the instance.
1451 *
1452 * @param tokenizer the tokenizer to be used
1453 * @param flag if method should test for carriage return after
1454 * each instance
1455 * @return false if end of file has been reached
1456 * @exception IOException if the information is not read
1457 * successfully
1458 */
1459 protected boolean getInstanceFull(StreamTokenizer tokenizer,
1460 boolean flag)
1461 throws IOException {
1462
1463 double[] instance = new double[numAttributes()];
1464 int index;
1465
1466 // Get values for all attributes.
1467 for (int i = 0; i < numAttributes(); i++){
1468
1469 // Get next token
1470 if (i > 0) {
1471 getNextToken(tokenizer);
1472 }
1473
1474 // Check if value is missing.
1475 if (tokenizer.ttype == '?') {
1476 instance[i] = Instance.missingValue();
1477 } else {
1478
1479 // Check if token is valid.
1480 if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
1481 errms(tokenizer,"not a valid value");
1482 }
1483 if (attribute(i).isNominal()) {
1484
1485 // Check if value appears in header.
1486 index = attribute(i).indexOfValue(tokenizer.sval);
1487 if (index == -1) {
1488 errms(tokenizer,"nominal value not declared in header");
1489 }
1490 instance[i] = (double)index;
1491 } else if (attribute(i).isNumeric()) {
1492
1493 // Check if value is really a number.
1494 try{
1495 instance[i] = Double.valueOf(tokenizer.sval).
1496 doubleValue();
1497 } catch (NumberFormatException e) {
1498 errms(tokenizer,"number expected");
1499 }
1500 } else {
1501 instance[i] = attribute(i).addStringValue(tokenizer.sval);
1502 }
1503 }
1504 }
1505 if (flag) {
1506 getLastToken(tokenizer,true);
1507 }
1508
1509 // Add instance to dataset
1510 add(new Instance(1, instance));
1511 return true;
1512 }
1513
1514 /**
1515 * Reads and stores header of an ARFF file.
1516 *
1517 * @param tokenizer the stream tokenizer
1518 * @exception IOException if the information is not read
1519 * successfully
1520 */
1521 protected void readHeader(StreamTokenizer tokenizer)
1522 throws IOException{
1523
1524 String attributeName;
1525 FastVector attributeValues;
1526 int i;
1527
1528 // Get name of relation.
1529 getFirstToken(tokenizer);
1530 if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
1531 errms(tokenizer,"premature end of file");
1532 }
1533 if (tokenizer.sval.equalsIgnoreCase("@relation")){
1534 getNextToken(tokenizer);
1535 m_RelationName = tokenizer.sval;
1536 getLastToken(tokenizer,false);
1537 } else {
1538 errms(tokenizer,"keyword @relation expected");
1539 }
1540
1541 // Create vectors to hold information temporarily.
1542 m_Attributes = new FastVector();
1543
1544 // Get attribute declarations.
1545 getFirstToken(tokenizer);
1546 if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
1547 errms(tokenizer,"premature end of file");
1548 }
1549 while (tokenizer.sval.equalsIgnoreCase("@attribute")) {
1550
1551 // Get attribute name.
1552 getNextToken(tokenizer);
1553 attributeName = tokenizer.sval;
1554 getNextToken(tokenizer);
1555
1556 // Check if attribute is nominal.
1557 if (tokenizer.ttype == StreamTokenizer.TT_WORD) {
1558
1559 // Attribute is real, integer, or string.
1560 if (tokenizer.sval.equalsIgnoreCase("real") ||
1561 tokenizer.sval.equalsIgnoreCase("integer") ||
1562 tokenizer.sval.equalsIgnoreCase("numeric")) {
1563 m_Attributes.addElement(new Attribute(attributeName,
1564 numAttributes()));
1565 readTillEOL(tokenizer);
1566 } else if (tokenizer.sval.equalsIgnoreCase("string")) {
1567 m_Attributes.
1568 addElement(new Attribute(attributeName, null,
1569 numAttributes()));
1570 readTillEOL(tokenizer);
1571 } else {
1572 errms(tokenizer,"no valid attribute type or invalid "+
1573 "enumeration");
1574 }
1575 } else {
1576
1577 // Attribute is nominal.
1578 attributeValues = new FastVector();
1579 tokenizer.pushBack();
1580
1581 // Get values for nominal attribute.
1582 if (tokenizer.nextToken() != '{') {
1583 errms(tokenizer,"{ expected at beginning of enumeration");
1584 }
1585 while (tokenizer.nextToken() != '}') {
1586 if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
1587 errms(tokenizer,"} expected at end of enumeration");
1588 } else {
1589 attributeValues.addElement(tokenizer.sval);
1590 }
1591 }
1592 if (attributeValues.size() == 0) {
1593 errms(tokenizer,"no nominal values found");
1594 }
1595 m_Attributes.
1596 addElement(new Attribute(attributeName, attributeValues,
1597 numAttributes()));
1598 }
1599 getLastToken(tokenizer,false);
1600 getFirstToken(tokenizer);
1601 if (tokenizer.ttype == StreamTokenizer.TT_EOF)
1602 errms(tokenizer,"premature end of file");
1603 }
1604
1605 // Check if data part follows. We can't easily check for EOL.
1606 if (!tokenizer.sval.equalsIgnoreCase("@data")) {
1607 errms(tokenizer,"keyword @data expected");
1608 }
1609
1610 // Check if any attributes have been declared.
1611 if (m_Attributes.size() == 0) {
1612 errms(tokenizer,"no attributes declared");
1613 }
1614
1615 // Allocate buffers in case sparse instances have to be read
1616 m_ValueBuffer = new double[numAttributes()];
1617 m_IndicesBuffer = new int[numAttributes()];
1618 }
1619
1620 /**
1621 * Copies instances from one set to the end of another
1622 * one.
1623 *
1624 * @param source the source of the instances
1625 * @param from the position of the first instance to be copied
1626 * @param dest the destination for the instances
1627 * @param num the number of instances to be copied
1628 */
1629 private void copyInstances(int from, Instances dest, int num) {
1630
1631 for (int i = 0; i < num; i++) {
1632 dest.add(instance(from + i));
1633 }
1634 }
1635
1636 /**
1637 * Throws error message with line number and last token read.
1638 *
1639 * @param theMsg the error message to be thrown
1640 * @param tokenizer the stream tokenizer
1641 * @throws IOExcpetion containing the error message
1642 */
1643 private void errms(StreamTokenizer tokenizer, String theMsg)
1644 throws IOException {
1645
1646 throw new IOException(theMsg + ", read " + tokenizer.toString());
1647 }
1648
1649 /**
1650 * Replaces the attribute information by a clone of
1651 * itself.
1652 */
1653 private void freshAttributeInfo() {
1654
1655 m_Attributes = (FastVector) m_Attributes.copyElements();
1656 }
1657
1658 /**
1659 * Gets next token, skipping empty lines.
1660 *
1661 * @param tokenizer the stream tokenizer
1662 * @exception IOException if reading the next token fails
1663 */
1664 private void getFirstToken(StreamTokenizer tokenizer)
1665 throws IOException{
1666
1667 while (tokenizer.nextToken() == StreamTokenizer.TT_EOL){};
1668 if ((tokenizer.ttype == '\'') ||
1669 (tokenizer.ttype == '"')) {
1670 tokenizer.ttype = StreamTokenizer.TT_WORD;
1671 } else if ((tokenizer.ttype == StreamTokenizer.TT_WORD) &&
1672 (tokenizer.sval.equals("?"))){
1673 tokenizer.ttype = '?';
1674 }
1675 }
1676
1677 /**
1678 * Gets index, checking for a premature and of line.
1679 *
1680 * @param tokenizer the stream tokenizer
1681 * @exception IOException if it finds a premature end of line
1682 */
1683 private void getIndex(StreamTokenizer tokenizer) throws IOException{
1684
1685 if (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
1686 errms(tokenizer,"premature end of line");
1687 }
1688 if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
1689 errms(tokenizer,"premature end of file");
1690 }
1691 }
1692
1693 /**
1694 * Gets token and checks if its end of line.
1695 *
1696 * @param tokenizer the stream tokenizer
1697 * @exception IOException if it doesn't find an end of line
1698 */
1699 private void getLastToken(StreamTokenizer tokenizer, boolean endOfFileOk)
1700 throws IOException{
1701
1702 if ((tokenizer.nextToken() != StreamTokenizer.TT_EOL) &&
1703 ((tokenizer.nextToken() != StreamTokenizer.TT_EOF) || !endOfFileOk)) {
1704 errms(tokenizer,"end of line expected");
1705 }
1706 }
1707
1708 /**
1709 * Gets next token, checking for a premature and of line.
1710 *
1711 * @param tokenizer the stream tokenizer
1712 * @exception IOException if it finds a premature end of line
1713 */
1714 private void getNextToken(StreamTokenizer tokenizer)
1715 throws IOException{
1716
1717 if (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
1718 errms(tokenizer,"premature end of line");
1719 }
1720 if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
1721 errms(tokenizer,"premature end of file");
1722 } else if ((tokenizer.ttype == '\'') ||
1723 (tokenizer.ttype == '"')) {
1724 tokenizer.ttype = StreamTokenizer.TT_WORD;
1725 } else if ((tokenizer.ttype == StreamTokenizer.TT_WORD) &&
1726 (tokenizer.sval.equals("?"))){
1727 tokenizer.ttype = '?';
1728 }
1729 }
1730
1731 /**
1732 * Initializes the StreamTokenizer used for reading the ARFF file.
1733 *
1734 * @param tokenizer the stream tokenizer
1735 */
1736 private void initTokenizer(StreamTokenizer tokenizer){
1737
1738 tokenizer.resetSyntax();
1739 tokenizer.whitespaceChars(0, ' ');
1740 tokenizer.wordChars(' '+1,'\u00FF');
1741 tokenizer.whitespaceChars(',',',');
1742 tokenizer.commentChar('%');
1743 tokenizer.quoteChar('"');
1744 tokenizer.quoteChar('\'');
1745 tokenizer.ordinaryChar('{');
1746 tokenizer.ordinaryChar('}');
1747 tokenizer.eolIsSignificant(true);
1748 }
1749
1750 /**
1751 * Returns string including all instances, their weights and
1752 * their indices in the original dataset.
1753 *
1754 * @return description of instance and its weight as a string
1755 */
1756 private String instancesAndWeights(){
1757
1758 StringBuffer text = new StringBuffer();
1759
1760 for (int i = 0; i < numInstances(); i++) {
1761 text.append(instance(i) + " " + instance(i).weight());
1762 if (i < numInstances() - 1) {
1763 text.append("\n");
1764 }
1765 }
1766 return text.toString();
1767 }
1768
1769 /**
1770 * Implements quicksort.
1771 *
1772 * @param attIndex the attribute's index
1773 * @param lo0 the first index of the subset to be sorted
1774 * @param hi0 the last index of the subset to be sorted
1775 */
1776 private void quickSort(int attIndex, int lo0, int hi0) {
1777
1778 int lo = lo0, hi = hi0;
1779 double mid, midPlus, midMinus;
1780
1781 if (hi0 > lo0) {
1782
1783 // Arbitrarily establishing partition element as the
1784 // midpoint of the array.
1785 mid = instance((lo0 + hi0) / 2).value(attIndex);
1786 midPlus = mid + 1e-6;
1787 midMinus = mid - 1e-6;
1788
1789 // loop through the array until indices cross
1790 while(lo <= hi) {
1791
1792 // find the first element that is greater than or equal to
1793 // the partition element starting from the left Index.
1794 while ((instance(lo).value(attIndex) <
1795 midMinus) && (lo < hi0)) {
1796 ++lo;
1797 }
1798
1799 // find an element that is smaller than or equal to
1800 // the partition element starting from the right Index.
1801 while ((instance(hi).value(attIndex) >
1802 midPlus) && (hi > lo0)) {
1803 --hi;
1804 }
1805
1806 // if the indexes have not crossed, swap
1807 if(lo <= hi) {
1808 swap(lo,hi);
1809 ++lo;
1810 --hi;
1811 }
1812 }
1813
1814 // If the right index has not reached the left side of array
1815 // must now sort the left partition.
1816 if(lo0 < hi) {
1817 quickSort(attIndex,lo0,hi);
1818 }
1819
1820 // If the left index has not reached the right side of array
1821 // must now sort the right partition.
1822 if(lo < hi0) {
1823 quickSort(attIndex,lo,hi0);
1824 }
1825 }
1826 }
1827
1828 /**
1829 * Reads and skips all tokens before next end of line token.
1830 *
1831 * @param tokenizer the stream tokenizer
1832 */
1833 private void readTillEOL(StreamTokenizer tokenizer)
1834 throws IOException{
1835
1836 while (tokenizer.nextToken() != StreamTokenizer.TT_EOL) {};
1837 tokenizer.pushBack();
1838 }
1839
1840 /**
1841 * Help function needed for stratification of set.
1842 *
1843 * @param numFolds the number of folds for the stratification
1844 */
1845 private void stratStep (int numFolds){
1846
1847 FastVector newVec = new FastVector(m_Instances.capacity());
1848 int start = 0, j;
1849
1850 // create stratified batch
1851 while (newVec.size() < numInstances()) {
1852 j = start;
1853 while (j < numInstances()) {
1854 newVec.addElement(instance(j));
1855 j = j + numFolds;
1856 }
1857 start++;
1858 }
1859 m_Instances = newVec;
1860 }
1861
1862 /**
1863 * Swaps two instances in the set.
1864 *
1865 * @param i the first instance's index
1866 * @param j the second instance's index
1867 */
1868 private void swap(int i, int j){
1869
1870 m_Instances.swap(i, j);
1871 }
1872
1873 /**
1874 * Merges two sets of Instances together. The resulting set will have
1875 * all the attributes of the first set plus all the attributes of the
1876 * second set. The number of instances in both sets must be the same.
1877 *
1878 * @param first the first set of Instances
1879 * @param second the second set of Instances
1880 * @return the merged set of Instances
1881 * @exception IllegalArgumentException if the datasets are not the same size
1882 */
1883 public static Instances mergeInstances(Instances first, Instances second) {
1884
1885 if (first.numInstances() != second.numInstances()) {
1886 throw new IllegalArgumentException("Instance sets must be of the same size");
1887 }
1888
1889 // Create the vector of merged attributes
1890 FastVector newAttributes = new FastVector();
1891 for (int i = 0; i < first.numAttributes(); i++) {
1892 newAttributes.addElement(first.attribute(i));
1893 }
1894 for (int i = 0; i < second.numAttributes(); i++) {
1895 newAttributes.addElement(second.attribute(i));
1896 }
1897
1898 // Create the set of Instances
1899 Instances merged = new Instances(first.relationName() + '_'
1900 + second.relationName(),
1901 newAttributes,
1902 first.numInstances());
1903 // Merge each instance
1904 for (int i = 0; i < first.numInstances(); i++) {
1905 merged.add(first.instance(i).mergeInstance(second.instance(i)));
1906 }
1907 return merged;
1908 }
1909
1910 /**
1911 * Method for testing this class.
1912 *
1913 * @param argv should contain one element: the name of an ARFF file
1914 */
1915 public static void test(String [] argv) {
1916
1917 Instances instances, secondInstances, train, test, transformed, empty;
1918 Instance instance;
1919 Random random = new Random(2);
1920 Reader reader;
1921 int start, num;
1922 double newWeight;
1923 FastVector testAtts, testVals;
1924 int i,j;
1925
1926 try{
1927 if (argv.length > 1) {
1928 throw (new Exception("Usage: Instances [<filename>]"));
1929 }
1930
1931 // Creating set of instances from scratch
1932 testVals = new FastVector(2);
1933 testVals.addElement("first_value");
1934 testVals.addElement("second_value");
1935 testAtts = new FastVector(2);
1936 testAtts.addElement(new Attribute("nominal_attribute", testVals));
1937 testAtts.addElement(new Attribute("numeric_attribute"));
1938 instances = new Instances("test_set", testAtts, 10);
1939 instances.add(new Instance(instances.numAttributes()));
1940 instances.add(new Instance(instances.numAttributes()));
1941 instances.add(new Instance(instances.numAttributes()));
1942 instances.setClassIndex(0);
1943 System.out.println("\nSet of instances created from scratch:\n");
1944 System.out.println(instances);
1945
1946 if (argv.length == 1) {
1947 String filename = argv[0];
1948 reader = new FileReader(filename);
1949
1950 // Read first five instances and print them
1951 System.out.println("\nFirst five instances from file:\n");
1952 instances = new Instances(reader, 1);
1953 instances.setClassIndex(instances.numAttributes() - 1);
1954 i = 0;
1955 while ((i < 5) && (instances.readInstance(reader))) {
1956 i++;
1957 }
1958 System.out.println(instances);
1959
1960 // Read all the instances in the file
1961 reader = new FileReader(filename);
1962 instances = new Instances(reader);
1963
1964 // Make the last attribute be the class
1965 instances.setClassIndex(instances.numAttributes() - 1);
1966
1967 // Print header and instances.
1968 System.out.println("\nDataset:\n");
1969 System.out.println(instances);
1970 System.out.println("\nClass index: "+instances.classIndex());
1971 }
1972
1973 // Test basic methods based on class index.
1974 System.out.println("\nClass name: "+instances.classAttribute().name());
1975 System.out.println("\nClass index: "+instances.classIndex());
1976 System.out.println("\nClass is nominal: " +
1977 instances.classAttribute().isNominal());
1978 System.out.println("\nClass is numeric: " +
1979 instances.classAttribute().isNumeric());
1980 System.out.println("\nClasses:\n");
1981 for (i = 0; i < instances.numClasses(); i++) {
1982 System.out.println(instances.classAttribute().value(i));
1983 }
1984 System.out.println("\nClass values and labels of instances:\n");
1985 for (i = 0; i < instances.numInstances(); i++) {
1986 Instance inst = instances.instance(i);
1987 System.out.print(inst.classValue() + "\t");
1988 System.out.print(inst.toString(inst.classIndex()));
1989 if (instances.instance(i).classIsMissing()) {
1990 System.out.println("\tis missing");
1991 } else {
1992 System.out.println();
1993 }
1994 }
1995
1996 // Create random weights.
1997 System.out.println("\nCreating random weights for instances.");
1998 for (i = 0; i < instances.numInstances(); i++) {
1999 instances.instance(i).setWeight(random.nextDouble());
2000 }
2001
2002 // Print all instances and their weights (and the sum of weights).
2003 System.out.println("\nInstances and their weights:\n");
2004 System.out.println(instances.instancesAndWeights());
2005 System.out.print("\nSum of weights: ");
2006 System.out.println(instances.sumOfWeights());
2007
2008 // Insert an attribute
2009 secondInstances = new Instances(instances);
2010 Attribute testAtt = new Attribute("Inserted");
2011 secondInstances.insertAttributeAt(testAtt, 0);
2012 System.out.println("\nSet with inserted attribute:\n");
2013 System.out.println(secondInstances);
2014 System.out.println("\nClass name: "
2015 + secondInstances.classAttribute().name());
2016
2017 // Delete the attribute
2018 secondInstances.deleteAttributeAt(0);
2019 System.out.println("\nSet with attribute deleted:\n");
2020 System.out.println(secondInstances);
2021 System.out.println("\nClass name: "
2022 + secondInstances.classAttribute().name());
2023
2024 // Test if headers are equal
2025 System.out.println("\nHeaders equal: "+
2026 instances.equalHeaders(secondInstances) + "\n");
2027
2028 // Print data in internal format.
2029 System.out.println("\nData (internal values):\n");
2030 for (i = 0; i < instances.numInstances(); i++) {
2031 for (j = 0; j < instances.numAttributes(); j++) {
2032 if (instances.instance(i).isMissing(j)) {
2033 System.out.print("? ");
2034 } else {
2035 System.out.print(instances.instance(i).value(j) + " ");
2036 }
2037 }
2038 System.out.println();
2039 }
2040
2041 // Just print header
2042 System.out.println("\nEmpty dataset:\n");
2043 empty = new Instances(instances, 0);
2044 System.out.println(empty);
2045 System.out.println("\nClass name: "+empty.classAttribute().name());
2046
2047 // Create copy and rename an attribute and a value (if possible)
2048 if (empty.classAttribute().isNominal()) {
2049 Instances copy = new Instances(empty, 0);
2050 copy.renameAttribute(copy.classAttribute(), "new_name");
2051 copy.renameAttributeValue(copy.classAttribute(),
2052 copy.classAttribute().value(0),
2053 "new_val_name");
2054 System.out.println("\nDataset with names changed:\n" + copy);
2055 System.out.println("\nOriginal dataset:\n" + empty);
2056 }
2057
2058 // Create and prints subset of instances.
2059 start = instances.numInstances() / 4;
2060 num = instances.numInstances() / 2;
2061 System.out.print("\nSubset of dataset: ");
2062 System.out.println(num + " instances from " + (start + 1)
2063 + ". instance");
2064 secondInstances = new Instances(instances, start, num);
2065 System.out.println("\nClass name: "
2066 + secondInstances.classAttribute().name());
2067
2068 // Print all instances and their weights (and the sum of weights).
2069 System.out.println("\nInstances and their weights:\n");
2070 System.out.println(secondInstances.instancesAndWeights());
2071 System.out.print("\nSum of weights: ");
2072 System.out.println(secondInstances.sumOfWeights());
2073
2074 // Create and print training and test sets for 3-fold
2075 // cross-validation.
2076 System.out.println("\nTrain and test folds for 3-fold CV:");
2077 if (instances.classAttribute().isNominal()) {
2078 instances.stratify(3);
2079 }
2080 for (j = 0; j < 3; j++) {
2081 train = instances.trainCV(3,j);
2082 test = instances.testCV(3,j);
2083
2084 // Print all instances and their weights (and the sum of weights).
2085 System.out.println("\nTrain: ");
2086 System.out.println("\nInstances and their weights:\n");
2087 System.out.println(train.instancesAndWeights());
2088 System.out.print("\nSum of weights: ");
2089 System.out.println(train.sumOfWeights());
2090 System.out.println("\nClass name: "+train.classAttribute().name());
2091 System.out.println("\nTest: ");
2092 System.out.println("\nInstances and their weights:\n");
2093 System.out.println(test.instancesAndWeights());
2094 System.out.print("\nSum of weights: ");
2095 System.out.println(test.sumOfWeights());
2096 System.out.println("\nClass name: "+test.classAttribute().name());
2097 }
2098
2099 // Randomize instances and print them.
2100 System.out.println("\nRandomized dataset:");
2101 instances.randomize(random);
2102
2103 // Print all instances and their weights (and the sum of weights).
2104 System.out.println("\nInstances and their weights:\n");
2105 System.out.println(instances.instancesAndWeights());
2106 System.out.print("\nSum of weights: ");
2107 System.out.println(instances.sumOfWeights());
2108
2109 // Sort instances according to first attribute and
2110 // print them.
2111 System.out.print("\nInstances sorted according to first attribute:\n ");
2112 instances.sort(0);
2113
2114 // Print all instances and their weights (and the sum of weights).
2115 System.out.println("\nInstances and their weights:\n");
2116 System.out.println(instances.instancesAndWeights());
2117 System.out.print("\nSum of weights: ");
2118 System.out.println(instances.sumOfWeights());
2119 } catch (Exception e) {
2120 e.printStackTrace();
2121 }
2122 }
2123
2124 /**
2125 * Main method for this class -- just prints a summary of a set
2126 * of instances.
2127 *
2128 * @param argv should contain one element: the name of an ARFF file
2129 */
2130 public static void main(String [] args) {
2131
2132 try {
2133 Reader r = null;
2134 if (args.length > 1) {
2135 throw (new Exception("Usage: Instances <filename>"));
2136 } else if (args.length == 0) {
2137 r = new BufferedReader(new InputStreamReader(System.in));
2138 } else {
2139 r = new BufferedReader(new FileReader(args[0]));
2140 }
2141 Instances i = new Instances(r);
2142 System.out.println(i.toSummaryString());
2143 } catch (Exception ex) {
2144 System.err.println(ex.getMessage());
2145 }
2146 }
2147}
2148
2149
2150
Note: See TracBrowser for help on using the repository browser.