Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/packages/kea/kea-3.0/weka/core/Instances.java@ 8815

Last change on this file since 8815 was 8815, checked in by mdewsnip, 19 years ago
Kea 3.0, as downloaded from http://www.nzdl.org/kea but with CSTR_abstracts_test, CSTR_abstracts_train, Chinese_test, and Chinese_train directories removed.
Property svn:keywords set to `Author Date Id Revision`
File size: 63.2 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* Instances.java
19	* Copyright (C) 1999 Eibe Frank
20	*
21	*/
22
23	package weka.core;
24
25	import java.io.*;
26	import java.util.*;
27
28	/**
29	* Class for handling an ordered set of weighted instances. <p>
30	*
31	* Typical usage (code from the main() method of this class): <p>
32	*
33	* <code>
34	* ... <br>
35	*
36	* // Read all the instances in the file <br>
37	* reader = new FileReader(filename); <br>
38	* instances = new Instances(reader); <br><br>
39	*
40	* // Make the last attribute be the class <br>
41	* instances.setClassIndex(instances.numAttributes() - 1); <br><br>
42	*
43	* // Print header and instances. <br>
44	* System.out.println("\nDataset:\n"); <br>
45	* System.out.println(instances); <br><br>
46	*
47	* ... <br>
48	* </code><p>
49	*
50	* All methods that change a set of instances are safe, ie. a change
51	* of a set of instances does not affect any other sets of
52	* instances. All methods that change a dataset's attribute
53	* information clone the dataset before it is changed.
54	*
55	* @author Eibe Frank ([email protected])
56	* @author Len Trigg ([email protected])
57	* @version $Revision: 8815 $
58	*/
59	public class Instances implements Serializable {
60
61	/** The filename extension that should be used for arff files */
62	public static String FILE_EXTENSION = ".arff";
63
64	/** The dataset's name. */
65	protected String m_RelationName;
66
67	/** The attribute information. */
68	protected FastVector m_Attributes;
69
70	/** The instances. */
71	protected FastVector m_Instances;
72
73	/** The class attribute's index */
74	protected int m_ClassIndex;
75
76	/** Buffer of values for sparse instance */
77	protected double[] m_ValueBuffer;
78
79	/** Buffer of indices for sparse instance */
80	protected int[] m_IndicesBuffer;
81
82	/**
83	* Reads an ARFF file from a reader, and assigns a weight of
84	* one to each instance. Lets the index of the class
85	* attribute be undefined (negative).
86	*
87	* @param reader the reader
88	* @exception IOException if the ARFF file is not read
89	* successfully
90	*/
91	public Instances(Reader reader) throws IOException {
92
93	StreamTokenizer tokenizer;
94
95	tokenizer = new StreamTokenizer(reader);
96	initTokenizer(tokenizer);
97	readHeader(tokenizer);
98	m_ClassIndex = -1;
99	m_Instances = new FastVector(1000);
100	while (getInstance(tokenizer, true)) {};
101	compactify();
102	}
103
/**
 * Parses only the header of an ARFF stream and reserves room for the
 * requested number of instances; no instances are read. The class index
 * is left undefined (-1).
 *
 * @param reader the source of the ARFF header
 * @param capacity the initial capacity of the instance list
 * @exception IllegalArgumentException if the capacity is negative (the
 * header-reading code may also raise it for a malformed header)
 * @exception IOException if there is a problem with the reader
 */
public Instances(Reader reader, int capacity) throws IOException {

  // Reject a bad capacity before touching the reader at all.
  if (capacity < 0) {
    throw new IllegalArgumentException("Capacity has to be positive!");
  }

  StreamTokenizer headerTokens = new StreamTokenizer(reader);
  initTokenizer(headerTokens);
  readHeader(headerTokens);

  m_ClassIndex = -1;
  m_Instances = new FastVector(capacity);
}
128
129	/**
130	* Constructor copying all instances and references to
131	* the header information from the given set of instances.
132	*
133	* @param instances the set to be copied
134	*/
135	public Instances(Instances dataset) {
136
137	this(dataset, dataset.numInstances());
138
139	dataset.copyInstances(0, this, dataset.numInstances());
140	}
141
/**
 * Creates an empty dataset that reuses the header of another dataset.
 * The relation name, attribute information and class index are taken
 * over by reference; no instances are copied. A negative capacity is
 * treated as 0.
 *
 * @param dataset the instances from which the header information is
 * to be taken
 * @param capacity the capacity of the new dataset
 */
public Instances(Instances dataset, int capacity) {

  // Sharing references is enough for the header: strings can't be
  // modified, so a "shallow" copy is safe.
  m_RelationName = dataset.m_RelationName;
  m_Attributes = dataset.m_Attributes;
  m_ClassIndex = dataset.m_ClassIndex;

  // Clamp negative capacities to zero.
  m_Instances = new FastVector(Math.max(capacity, 0));
}
164
165	/**
166	* Creates a new set of instances by copying a
167	* subset of another set.
168	*
169	* @param source the set of instances from which a subset
170	* is to be created
171	* @param first the index of the first instance to be copied
172	* @param toCopy the number of instances to be copied
173	* @exception IllegalArgumentException if first and toCopy are out of range
174	*/
175	public Instances(Instances source, int first, int toCopy) {
176
177	this(source, toCopy);
178
179	if ((first < 0) \|\| ((first + toCopy) > source.numInstances())) {
180	throw new IllegalArgumentException("Parameters first and/or toCopy out "+
181	"of range");
182	}
183	source.copyInstances(first, this, toCopy);
184	}
185
/**
 * Creates an empty set of instances that uses the given attribute
 * information. The attribute vector is stored by reference and each
 * attribute in it has its index set to its position in the vector, so
 * the given attribute information must not be changed after this
 * constructor has been used. The class index is left undefined (-1).
 * A negative capacity is treated as 0.
 *
 * @param name the name of the relation
 * @param attInfo the attribute information
 * @param capacity the capacity of the set
 */
public Instances(String name, FastVector attInfo, int capacity) {

  m_RelationName = name;
  m_ClassIndex = -1;
  m_Attributes = attInfo;
  for (int i = 0; i < numAttributes(); i++) {
    attribute(i).setIndex(i);
  }

  // Honour the documented contract: a negative capacity means "empty",
  // rather than being handed to FastVector unchecked.
  if (capacity < 0) {
    capacity = 0;
  }
  m_Instances = new FastVector(capacity);
}
206
207	/**
208	* Create a copy of the structure, but "cleanse" string types (i.e.
209	* doesn't contain references to the strings seen in the past).
210	*
211	* @return a copy of the instance structure.
212	*/
213	public Instances stringFreeStructure() {
214
215	FastVector atts = (FastVector)m_Attributes.copy();
216	for (int i = 0 ; i < atts.size(); i++) {
217	Attribute att = (Attribute)atts.elementAt(i);
218	if (att.type() == Attribute.STRING) {
219	atts.setElementAt(new Attribute(att.name(), null), i);
220	}
221	}
222	Instances result = new Instances(relationName(), atts, 0);
223	result.m_ClassIndex = m_ClassIndex;
224	return result;
225	}
226
227	/**
228	* Adds one instance to the end of the set.
229	* Shallow copies instance before it is added. Increases the
230	* size of the dataset if it is not large enough. Does not
231	* check if the instance is compatible with the dataset.
232	*
233	* @param instance the instance to be added
234	*/
235	public final void add(Instance instance) {
236
237	Instance newInstance = (Instance)instance.copy();
238
239	newInstance.setDataset(this);
240	m_Instances.addElement(newInstance);
241	}
242
243	/**
244	* Returns an attribute.
245	*
246	* @param index the attribute's index
247	* @return the attribute at the given position
248	*/
249	public final Attribute attribute(int index) {
250
251	return (Attribute) m_Attributes.elementAt(index);
252	}
253
254	/**
255	* Returns an attribute given its name. If there is more than
256	* one attribute with the same name, it returns the first one.
257	* Returns null if the attribute can't be found.
258	*
259	* @param name the attribute's name
260	* @return the attribute with the given name, null if the
261	* attribute can't be found
262	*/
263	public final Attribute attribute(String name) {
264
265	for (int i = 0; i < numAttributes(); i++) {
266	if (attribute(i).name().equals(name)) {
267	return attribute(i);
268	}
269	}
270	return null;
271	}
272
273	/**
274	* Checks for string attributes in the dataset
275	*
276	* @return true if string attributes are present, false otherwise
277	*/
278	public boolean checkForStringAttributes() {
279
280	int i = 0;
281
282	while (i < m_Attributes.size()) {
283	if (attribute(i++).isString()) {
284	return true;
285	}
286	}
287	return false;
288	}
289
290	/**
291	* Checks if the given instance is compatible
292	* with this dataset. Only looks at the size of
293	* the instance and the ranges of the values for
294	* nominal and string attributes.
295	*
296	* @return true if the instance is compatible with the dataset
297	*/
298	public final boolean checkInstance(Instance instance) {
299
300	if (instance.numAttributes() != numAttributes()) {
301	return false;
302	}
303	for (int i = 0; i < numAttributes(); i++) {
304	if (instance.isMissing(i)) {
305	continue;
306	} else if (attribute(i).isNominal() \|\|
307	attribute(i).isString()) {
308	if (!(Utils.eq(instance.value(i),
309	(double)(int)instance.value(i)))) {
310	return false;
311	} else if (Utils.sm(instance.value(i), 0) \|\|
312	Utils.gr(instance.value(i),
313	attribute(i).numValues())) {
314	return false;
315	}
316	}
317	}
318	return true;
319	}
320
321	/**
322	* Returns the class attribute.
323	*
324	* @return the class attribute
325	* @exception UnassignedClassException if the class is not set
326	*/
327	public final Attribute classAttribute() {
328
329	if (m_ClassIndex < 0) {
330	throw new UnassignedClassException("Class index is negative (not set)!");
331	}
332	return attribute(m_ClassIndex);
333	}
334
335	/**
336	* Returns the class attribute's index. Returns negative number
337	* if it's undefined.
338	*
339	* @return the class index as an integer
340	*/
341	public final int classIndex() {
342
343	return m_ClassIndex;
344	}
345
346	/**
347	* Compactifies the set of instances. Decreases the capacity of
348	* the set so that it matches the number of instances in the set.
349	*/
350	public final void compactify() {
351
352	m_Instances.trimToSize();
353	}
354
355	/**
356	* Removes all instances from the set.
357	*/
358	public final void delete() {
359
360	m_Instances = new FastVector();
361	}
362
363	/**
364	* Removes an instance at the given position from the set.
365	*
366	* @param index the instance's position
367	*/
368	public final void delete(int index) {
369
370	m_Instances.removeElementAt(index);
371	}
372
373	/**
374	* Deletes an attribute at the given position
375	* (0 to numAttributes() - 1). A deep copy of the attribute
376	* information is performed before the attribute is deleted.
377	*
378	* @param pos the attribute's position
379	* @exception IllegalArgumentException if the given index is out of range or the
380	* class attribute is being deleted
381	*/
382	public void deleteAttributeAt(int position) {
383
384	if ((position < 0) \|\| (position >= m_Attributes.size())) {
385	throw new IllegalArgumentException("Index out of range");
386	}
387	if (position == m_ClassIndex) {
388	throw new IllegalArgumentException("Can't delete class attribute");
389	}
390	freshAttributeInfo();
391	if (m_ClassIndex > position) {
392	m_ClassIndex--;
393	}
394	m_Attributes.removeElementAt(position);
395	for (int i = position; i < m_Attributes.size(); i++) {
396	Attribute current = (Attribute)m_Attributes.elementAt(i);
397	current.setIndex(current.index() - 1);
398	}
399	for (int i = 0; i < numInstances(); i++) {
400	instance(i).forceDeleteAttributeAt(position);
401	}
402	}
403
404	/**
405	* Deletes all string attributes in the dataset. A deep copy of the attribute
406	* information is performed before an attribute is deleted.
407	*
408	* @exception IllegalArgumentException if string attribute couldn't be
409	* successfully deleted (probably because it is the class attribute).
410	*/
411	public void deleteStringAttributes() {
412
413	int i = 0;
414	while (i < m_Attributes.size()) {
415	if (attribute(i).isString()) {
416	deleteAttributeAt(i);
417	} else {
418	i++;
419	}
420	}
421	}
422
423	/**
424	* Removes all instances with missing values for a particular
425	* attribute from the dataset.
426	*
427	* @param attIndex the attribute's index
428	*/
429	public final void deleteWithMissing(int attIndex) {
430
431	FastVector newInstances = new FastVector(numInstances());
432
433	for (int i = 0; i < numInstances(); i++) {
434	if (!instance(i).isMissing(attIndex)) {
435	newInstances.addElement(instance(i));
436	}
437	}
438	m_Instances = newInstances;
439	}
440
441	/**
442	* Removes all instances with missing values for a particular
443	* attribute from the dataset.
444	*
445	* @param att the attribute
446	*/
447	public final void deleteWithMissing(Attribute att) {
448
449	deleteWithMissing(att.index());
450	}
451
452	/**
453	* Removes all instances with a missing class value
454	* from the dataset.
455	*
456	* @exception UnassignedClassException if class is not set
457	*/
458	public final void deleteWithMissingClass() {
459
460	if (m_ClassIndex < 0) {
461	throw new UnassignedClassException("Class index is negative (not set)!");
462	}
463	deleteWithMissing(m_ClassIndex);
464	}
465
466	/**
467	* Returns an enumeration of all the attributes.
468	*
469	* @return enumeration of all the attributes.
470	*/
471	public Enumeration enumerateAttributes() {
472
473	return m_Attributes.elements(m_ClassIndex);
474	}
475
476	/**
477	* Returns an enumeration of all instances in the dataset.
478	*
479	* @return enumeration of all instances in the dataset
480	*/
481	public final Enumeration enumerateInstances() {
482
483	return m_Instances.elements();
484	}
485
486	/**
487	* Checks if two headers are equivalent.
488	*
489	* @param dataset another dataset
490	* @return true if the header of the given dataset is equivalent
491	* to this header
492	*/
493	public final boolean equalHeaders(Instances dataset){
494
495	// Check class and all attributes
496	if (m_ClassIndex != dataset.m_ClassIndex) {
497	return false;
498	}
499	if (m_Attributes.size() != dataset.m_Attributes.size()) {
500	return false;
501	}
502	for (int i = 0; i < m_Attributes.size(); i++) {
503	if (!(attribute(i).equals(dataset.attribute(i)))) {
504	return false;
505	}
506	}
507	return true;
508	}
509
510	/**
511	* Returns the first instance in the set.
512	*
513	* @return the first instance in the set
514	*/
515	public final Instance firstInstance() {
516
517	return (Instance)m_Instances.firstElement();
518	}
519
520	/**
521	* Inserts an attribute at the given position (0 to
522	* numAttributes()) and sets all values to be missing.
523	* Shallow copies the attribute before it is inserted, and performs
524	* a deep copy of the existing attribute information.
525	*
526	* @param att the attribute to be inserted
527	* @param pos the attribute's position
528	* @exception IllegalArgumentException if the given index is out of range
529	*/
530	public void insertAttributeAt(Attribute att, int position) {
531
532	if ((position < 0) \|\|
533	(position > m_Attributes.size())) {
534	throw new IllegalArgumentException("Index out of range");
535	}
536	att = (Attribute)att.copy();
537	freshAttributeInfo();
538	att.setIndex(position);
539	m_Attributes.insertElementAt(att, position);
540	for (int i = position + 1; i < m_Attributes.size(); i++) {
541	Attribute current = (Attribute)m_Attributes.elementAt(i);
542	current.setIndex(current.index() + 1);
543	}
544	for (int i = 0; i < numInstances(); i++) {
545	instance(i).forceInsertAttributeAt(position);
546	}
547	if (m_ClassIndex >= position) {
548	m_ClassIndex++;
549	}
550	}
551
552	/**
553	* Returns the instance at the given position.
554	*
555	* @param index the instance's index
556	* @return the instance at the given position
557	*/
558	public final Instance instance(int index) {
559
560	return (Instance)m_Instances.elementAt(index);
561	}
562
563	/**
564	* Returns the last instance in the set.
565	*
566	* @return the last instance in the set
567	*/
568	public final Instance lastInstance() {
569
570	return (Instance)m_Instances.lastElement();
571	}
572
573	/**
574	* Returns the mean (mode) for a numeric (nominal) attribute as
575	* a floating-point value. Returns 0 if the attribute is neither nominal nor
576	* numeric. If all values are missing it returns zero.
577	*
578	* @param attIndex the attribute's index
579	* @return the mean or the mode
580	*/
581	public final double meanOrMode(int attIndex) {
582
583	double result, found;
584	int [] counts;
585
586	if (attribute(attIndex).isNumeric()) {
587	result = found = 0;
588	for (int j = 0; j < numInstances(); j++) {
589	if (!instance(j).isMissing(attIndex)) {
590	found += instance(j).weight();
591	result += instance(j).weight()*instance(j).value(attIndex);
592	}
593	}
594	if (Utils.eq(found, 0)) {
595	return 0;
596	} else {
597	return result / found;
598	}
599	} else if (attribute(attIndex).isNominal()) {
600	counts = new int[attribute(attIndex).numValues()];
601	for (int j = 0; j < numInstances(); j++) {
602	if (!instance(j).isMissing(attIndex)) {
603	counts[(int) instance(j).value(attIndex)] += instance(j).weight();
604	}
605	}
606	return (double)Utils.maxIndex(counts);
607	} else {
608	return 0;
609	}
610	}
611
612	/**
613	* Returns the mean (mode) for a numeric (nominal) attribute as a
614	* floating-point value. Returns 0 if the attribute is neither
615	* nominal nor numeric. If all values are missing it returns zero.
616	*
617	* @param att the attribute
618	* @return the mean or the mode
619	*/
620	public final double meanOrMode(Attribute att) {
621
622	return meanOrMode(att.index());
623	}
624
625	/**
626	* Returns the number of attributes.
627	*
628	* @return the number of attributes as an integer
629	*/
630	public final int numAttributes() {
631
632	return m_Attributes.size();
633	}
634
635	/**
636	* Returns the number of class labels.
637	*
638	* @return the number of class labels as an integer if the class
639	* attribute is nominal, 1 otherwise.
640	* @exception UnassignedClassException if the class is not set
641	*/
642	public final int numClasses() {
643
644	if (m_ClassIndex < 0) {
645	throw new UnassignedClassException("Class index is negative (not set)!");
646	}
647	if (!classAttribute().isNominal()) {
648	return 1;
649	} else {
650	return classAttribute().numValues();
651	}
652	}
653
654	/**
655	* Returns the number of distinct values of a given attribute.
656	* Returns the number of instances if the attribute is a
657	* string attribute. The value 'missing' is not counted.
658	*
659	* @param attIndex the attribute
660	* @return the number of distinct values of a given attribute
661	*/
662	public final int numDistinctValues(int attIndex) {
663
664	if (attribute(attIndex).isNumeric()) {
665	double [] attVals = attributeToDoubleArray(attIndex);
666	int [] sorted = Utils.sort(attVals);
667	double prev = 0;
668	int counter = 0;
669	for (int i = 0; i < sorted.length; i++) {
670	Instance current = instance(sorted[i]);
671	if (current.isMissing(attIndex)) {
672	break;
673	}
674	if ((i == 0) \|\|
675	Utils.gr(current.value(attIndex), prev)) {
676	prev = current.value(attIndex);
677	counter++;
678	}
679	}
680	return counter;
681	} else {
682	return attribute(attIndex).numValues();
683	}
684	}
685
686	/**
687	* Returns the number of distinct values of a given attribute.
688	* Returns the number of instances if the attribute is a
689	* string attribute. The value 'missing' is not counted.
690	*
691	* @param att the attribute
692	* @return the number of distinct values of a given attribute
693	*/
694	public final int numDistinctValues(Attribute att) {
695
696	return numDistinctValues(att.index());
697	}
698
699	/**
700	* Returns the number of instances in the dataset.
701	*
702	* @return the number of instances in the dataset as an integer
703	*/
704	public final int numInstances() {
705
706	return m_Instances.size();
707	}
708
709	/**
710	* Shuffles the instances in the set so that they are ordered
711	* randomly.
712	*
713	* @param random a random number generator
714	*/
715	public final void randomize(Random random) {
716
717	for (int j = numInstances() - 1; j > 0; j--)
718	swap(j,(int)(random.nextDouble()*(double)j));
719	}
720
721	/**
722	* Reads a single instance from the reader and appends it
723	* to the dataset. Automatically expands the dataset if it
724	* is not large enough to hold the instance. This method does
725	* not check for carriage return at the end of the line.
726	*
727	* @param reader the reader
728	* @return false if end of file has been reached
729	* @exception IOException if the information is not read
730	* successfully
731	*/
732	public final boolean readInstance(Reader reader)
733	throws IOException {
734
735	StreamTokenizer tokenizer = new StreamTokenizer(reader);
736
737	initTokenizer(tokenizer);
738	return getInstance(tokenizer, false);
739	}
740
741	/**
742	* Returns the relation's name.
743	*
744	* @return the relation's name as a string
745	*/
746	public final String relationName() {
747
748	return m_RelationName;
749	}
750
751	/**
752	* Renames an attribute. This change only affects this
753	* dataset.
754	*
755	* @param att the attribute's index
756	* @param name the new name
757	*/
758	public final void renameAttribute(int att, String name) {
759
760	Attribute newAtt = attribute(att).copy(name);
761	FastVector newVec = new FastVector(numAttributes());
762
763	for (int i = 0; i < numAttributes(); i++) {
764	if (i == att) {
765	newVec.addElement(newAtt);
766	} else {
767	newVec.addElement(attribute(i));
768	}
769	}
770	m_Attributes = newVec;
771	}
772
773	/**
774	* Renames an attribute. This change only affects this
775	* dataset.
776	*
777	* @param att the attribute
778	* @param name the new name
779	*/
780	public final void renameAttribute(Attribute att, String name) {
781
782	renameAttribute(att.index(), name);
783	}
784
785	/**
786	* Renames the value of a nominal (or string) attribute value. This
787	* change only affects this dataset.
788	*
789	* @param att the attribute's index
790	* @param val the value's index
791	* @param name the new name
792	*/
793	public final void renameAttributeValue(int att, int val, String name) {
794
795	Attribute newAtt = (Attribute)attribute(att).copy();
796	FastVector newVec = new FastVector(numAttributes());
797
798	newAtt.setValue(val, name);
799	for (int i = 0; i < numAttributes(); i++) {
800	if (i == att) {
801	newVec.addElement(newAtt);
802	} else {
803	newVec.addElement(attribute(i));
804	}
805	}
806	m_Attributes = newVec;
807	}
808
809	/**
810	* Renames the value of a nominal (or string) attribute value. This
811	* change only affects this dataset.
812	*
813	* @param att the attribute
814	* @param val the value
815	* @param name the new name
816	*/
817	public final void renameAttributeValue(Attribute att, String val,
818	String name) {
819
820	renameAttributeValue(att.index(), att.indexOfValue(val), name);
821	}
822
823	/**
824	* Creates a new dataset of the same size using random sampling
825	* with replacement.
826	*
827	* @param random a random number generator
828	* @return the new dataset
829	*/
830	public final Instances resample(Random random) {
831
832	Instances newData = new Instances(this, numInstances());
833	while (newData.numInstances() < numInstances()) {
834	int i = (int) (random.nextDouble() * (double) numInstances());
835	newData.add(instance(i));
836	}
837	return newData;
838	}
839
840	/**
841	* Creates a new dataset of the same size using random sampling
842	* with replacement according to the current instance weights. The
843	* weights of the instances in the new dataset are set to one.
844	*
845	* @param random a random number generator
846	* @return the new dataset
847	*/
848	public final Instances resampleWithWeights(Random random) {
849
850	double [] weights = new double[numInstances()];
851	boolean foundOne = false;
852	for (int i = 0; i < weights.length; i++) {
853	weights[i] = instance(i).weight();
854	if (!Utils.eq(weights[i], weights[0])) {
855	foundOne = true;
856	}
857	}
858	if (foundOne) {
859	return resampleWithWeights(random, weights);
860	} else {
861	return new Instances(this);
862	}
863	}
864
865
866	/**
867	* Creates a new dataset of the same size using random sampling
868	* with replacement according to the given weight vector. The
869	* weights of the instances in the new dataset are set to one.
870	* The length of the weight vector has to be the same as the
871	* number of instances in the dataset, and all weights have to
872	* be positive.
873	*
874	* @param random a random number generator
875	* @param weights the weight vector
876	* @return the new dataset
877	* @exception IllegalArgumentException if the weights array is of the wrong
878	* length or contains negative weights.
879	*/
880	public final Instances resampleWithWeights(Random random,
881	double[] weights) {
882
883	if (weights.length != numInstances()) {
884	throw new IllegalArgumentException("weights.length != numInstances.");
885	}
886	Instances newData = new Instances(this, numInstances());
887	double[] probabilities = new double[numInstances()];
888	double sumProbs = 0, sumOfWeights = Utils.sum(weights);
889	for (int i = 0; i < numInstances(); i++) {
890	sumProbs += random.nextDouble();
891	probabilities[i] = sumProbs;
892	}
893	Utils.normalize(probabilities, sumProbs / sumOfWeights);
894
895	// Make sure that rounding errors don't mess things up
896	probabilities[numInstances() - 1] = sumOfWeights;
897	int k = 0; int l = 0;
898	sumProbs = 0;
899	while ((k < numInstances() && (l < numInstances()))) {
900	if (weights[l] < 0) {
901	throw new IllegalArgumentException("Weights have to be positive.");
902	}
903	sumProbs += weights[l];
904	while ((k < numInstances()) &&
905	(probabilities[k] <= sumProbs)) {
906	newData.add(instance(l));
907	newData.instance(k).setWeight(1);
908	k++;
909	}
910	l++;
911	}
912	return newData;
913	}
914
915	/**
916	* Sets the class attribute.
917	*
918	* @param att attribute to be the class
919	*/
920	public final void setClass(Attribute att) {
921
922	m_ClassIndex = att.index();
923	}
924
925	/**
926	* Sets the class index of the set.
927	* If the class index is negative there is assumed to be no class.
928	* (ie. it is undefined)
929	*
930	* @param classIndex the new class index
931	* @exception IllegalArgumentException if the class index is too big or < 0
932	*/
933	public final void setClassIndex(int classIndex) {
934
935	if (classIndex >= numAttributes()) {
936	throw new IllegalArgumentException("Invalid class index: " + classIndex);
937	}
938	m_ClassIndex = classIndex;
939	}
940
941	/**
942	* Sets the relation's name.
943	*
944	* @param newName the new relation name.
945	*/
946	public final void setRelationName(String newName) {
947
948	m_RelationName = newName;
949	}
950
951	/**
952	* Sorts the instances based on an attribute. For numeric attributes,
953	* instances are sorted in ascending order. For nominal attributes,
954	* instances are sorted based on the attribute label ordering
955	* specified in the header. Instances with missing values for the
956	* attribute are placed at the end of the dataset.
957	*
958	* @param attIndex the attribute's index
959	*/
960	public final void sort(int attIndex) {
961
962	int i,j;
963
964	// move all instances with missing values to end
965	j = numInstances() - 1;
966	i = 0;
967	while (i <= j) {
968	if (instance(j).isMissing(attIndex)) {
969	j--;
970	} else {
971	if (instance(i).isMissing(attIndex)) {
972	swap(i,j);
973	j--;
974	}
975	i++;
976	}
977	}
978	quickSort(attIndex, 0, j);
979	}
980
981	/**
982	* Sorts the instances based on an attribute. For numeric attributes,
983	* instances are sorted into ascending order. For nominal attributes,
984	* instances are sorted based on the attribute label ordering
985	* specified in the header. Instances with missing values for the
986	* attribute are placed at the end of the dataset.
987	*
988	* @param att the attribute
989	*/
990	public final void sort(Attribute att) {
991
992	sort(att.index());
993	}
994
995	/**
996	* Stratifies a set of instances according to its class values
997	* if the class attribute is nominal (so that afterwards a
998	* stratified cross-validation can be performed).
999	*
1000	* @param numFolds the number of folds in the cross-validation
1001	* @exception UnassignedClassException if the class is not set
1002	*/
1003	public final void stratify(int numFolds) {
1004
1005	if (numFolds <= 0) {
1006	throw new IllegalArgumentException("Number of folds must be greater than 1");
1007	}
1008	if (m_ClassIndex < 0) {
1009	throw new UnassignedClassException("Class index is negative (not set)!");
1010	}
1011	if (classAttribute().isNominal()) {
1012
1013	// sort by class
1014	int index = 1;
1015	while (index < numInstances()) {
1016	Instance instance1 = instance(index - 1);
1017	for (int j = index; j < numInstances(); j++) {
1018	Instance instance2 = instance(j);
1019	if ((instance1.classValue() == instance2.classValue()) \|\|
1020	(instance1.classIsMissing() &&
1021	instance2.classIsMissing())) {
1022	swap(index,j);
1023	index++;
1024	}
1025	}
1026	index++;
1027	}
1028	stratStep(numFolds);
1029	}
1030	}
1031
1032	/**
1033	* Computes the sum of all the instances' weights.
1034	*
1035	* @return the sum of all the instances' weights as a double
1036	*/
1037	public final double sumOfWeights() {
1038
1039	double sum = 0;
1040
1041	for (int i = 0; i < numInstances(); i++) {
1042	sum += instance(i).weight();
1043	}
1044	return sum;
1045	}
1046
1047	/**
1048	* Creates the test set for one fold of a cross-validation on
1049	* the dataset.
1050	*
1051	* @param numFolds the number of folds in the cross-validation. Must
1052	* be greater than 1.
1053	* @param numFold 0 for the first fold, 1 for the second, ...
1054	* @return the test set as a set of weighted instances
1055	* @exception IllegalArgumentException if the number of folds is less than 2
1056	* or greater than the number of instances.
1057	*/
1058	public Instances testCV(int numFolds, int numFold) {
1059
1060	int numInstForFold, first, offset;
1061	Instances test;
1062
1063	if (numFolds < 2) {
1064	throw new IllegalArgumentException("Number of folds must be at least 2!");
1065	}
1066	if (numFolds > numInstances()) {
1067	throw new IllegalArgumentException("Can't have more folds than instances!");
1068	}
1069	numInstForFold = numInstances() / numFolds;
1070	if (numFold < numInstances() % numFolds){
1071	numInstForFold++;
1072	offset = numFold;
1073	}else
1074	offset = numInstances() % numFolds;
1075	test = new Instances(this, numInstForFold);
1076	first = numFold * (numInstances() / numFolds) + offset;
1077	copyInstances(first, test, numInstForFold);
1078	return test;
1079	}
1080
1081	/**
1082	* Returns the dataset as a string in ARFF format. Strings
1083	* are quoted if they contain whitespace characters, or if they
1084	* are a question mark.
1085	*
1086	* @return the dataset in ARFF format as a string
1087	*/
1088	public final String toString() {
1089
1090	StringBuffer text = new StringBuffer();
1091
1092	text.append("@relation " + Utils.quote(m_RelationName) + "\n\n");
1093	for (int i = 0; i < numAttributes(); i++) {
1094	text.append(attribute(i) + "\n");
1095	}
1096	text.append("\n@data\n");
1097	for (int i = 0; i < numInstances(); i++) {
1098	text.append(instance(i));
1099	if (i < numInstances() - 1) {
1100	text.append('\n');
1101	}
1102	}
1103	return text.toString();
1104	}
1105
1106	/**
1107	* Creates the training set for one fold of a cross-validation
1108	* on the dataset.
1109	*
1110	* @param numFolds the number of folds in the cross-validation. Must
1111	* be greater than 1.
1112	* @param numFold 0 for the first fold, 1 for the second, ...
1113	* @return the training set as a set of weighted
1114	* instances
1115	* @exception IllegalArgumentException if the number of folds is less than 2
1116	* or greater than the number of instances.
1117	*/
1118	public Instances trainCV(int numFolds, int numFold) {
1119
1120	int numInstForFold, first, offset;
1121	Instances train;
1122
1123	if (numFolds < 2) {
1124	throw new IllegalArgumentException("Number of folds must be at least 2!");
1125	}
1126	if (numFolds > numInstances()) {
1127	throw new IllegalArgumentException("Can't have more folds than instances!");
1128	}
1129	numInstForFold = numInstances() / numFolds;
1130	if (numFold < numInstances() % numFolds) {
1131	numInstForFold++;
1132	offset = numFold;
1133	}else
1134	offset = numInstances() % numFolds;
1135	train = new Instances(this, numInstances() - numInstForFold);
1136	first = numFold * (numInstances() / numFolds) + offset;
1137	copyInstances(0, train, first);
1138	copyInstances(first + numInstForFold, train,
1139	numInstances() - first - numInstForFold);
1140
1141	return train;
1142	}
1143
1144	/**
1145	* Computes the variance for a numeric attribute.
1146	*
1147	* @param attIndex the numeric attribute
1148	* @return the variance if the attribute is numeric
1149	* @exception IllegalArgumentException if the attribute is not numeric
1150	*/
1151	public final double variance(int attIndex) {
1152
1153	double sum = 0, sumSquared = 0, sumOfWeights = 0;
1154
1155	if (!attribute(attIndex).isNumeric()) {
1156	throw new IllegalArgumentException("Can't compute variance because attribute is " +
1157	"not numeric!");
1158	}
1159	for (int i = 0; i < numInstances(); i++) {
1160	if (!instance(i).isMissing(attIndex)) {
1161	sum += instance(i).weight() *
1162	instance(i).value(attIndex);
1163	sumSquared += instance(i).weight() *
1164	instance(i).value(attIndex) *
1165	instance(i).value(attIndex);
1166	sumOfWeights += instance(i).weight();
1167	}
1168	}
1169	if (Utils.smOrEq(sumOfWeights, 1)) {
1170	return 0;
1171	}
1172	return (sumSquared - (sum * sum / sumOfWeights)) /
1173	(sumOfWeights - 1);
1174	}
1175
1176	/**
1177	* Computes the variance for a numeric attribute.
1178	*
1179	* @param att the numeric attribute
1180	* @return the variance if the attribute is numeric
1181	* @exception IllegalArgumentException if the attribute is not numeric
1182	*/
1183	public final double variance(Attribute att) {
1184
1185	return variance(att.index());
1186	}
1187
1188	/**
1189	* Calculates summary statistics on the values that appear in this
1190	* set of instances for a specified attribute.
1191	*
1192	* @param index the index of the attribute to summarize.
1193	* @return an AttributeStats object with it's fields calculated.
1194	*/
1195	public AttributeStats attributeStats(int index) {
1196
1197	AttributeStats result = new AttributeStats();
1198	if (attribute(index).isNominal()) {
1199	result.nominalCounts = new int [attribute(index).numValues()];
1200	}
1201	if (attribute(index).isNumeric()) {
1202	result.numericStats = new weka.experiment.Stats();
1203	}
1204	result.totalCount = numInstances();
1205
1206	double [] attVals = attributeToDoubleArray(index);
1207	int [] sorted = Utils.sort(attVals);
1208	int currentCount = 0;
1209	double prev = Instance.missingValue();
1210	for (int j = 0; j < numInstances(); j++) {
1211	Instance current = instance(sorted[j]);
1212	if (current.isMissing(index)) {
1213	result.missingCount = numInstances() - j;
1214	break;
1215	}
1216	if (Utils.eq(current.value(index), prev)) {
1217	currentCount++;
1218	} else {
1219	result.addDistinct(prev, currentCount);
1220	currentCount = 1;
1221	prev = current.value(index);
1222	}
1223	}
1224	result.addDistinct(prev, currentCount);
1225	result.distinctCount--; // So we don't count "missing" as a value
1226	return result;
1227	}
1228
1229	/**
1230	* Gets the value of all instances in this dataset for a particular
1231	* attribute. Useful in conjunction with Utils.sort to allow iterating
1232	* through the dataset in sorted order for some attribute.
1233	*
1234	* @param index the index of the attribute.
1235	* @return an array containing the value of the desired attribute for
1236	* each instance in the dataset.
1237	*/
1238	public double [] attributeToDoubleArray(int index) {
1239
1240	double [] result = new double[numInstances()];
1241	for (int i = 0; i < result.length; i++) {
1242	result[i] = instance(i).value(index);
1243	}
1244	return result;
1245	}
1246
1247	/**
1248	* Generates a string summarizing the set of instances. Gives a breakdown
1249	* for each attribute indicating the number of missing/discrete/unique
1250	* values and other information.
1251	*
1252	* @return a string summarizing the dataset
1253	*/
1254	public String toSummaryString() {
1255
1256	StringBuffer result = new StringBuffer();
1257	result.append("Relation Name: ").append(relationName()).append('\n');
1258	result.append("Num Instances: ").append(numInstances()).append('\n');
1259	result.append("Num Attributes: ").append(numAttributes()).append('\n');
1260	result.append('\n');
1261
1262	result.append(Utils.padLeft("", 5)).append(Utils.padRight("Name", 25));
1263	result.append(Utils.padLeft("Type", 5)).append(Utils.padLeft("Nom", 5));
1264	result.append(Utils.padLeft("Int", 5)).append(Utils.padLeft("Real", 5));
1265	result.append(Utils.padLeft("Missing", 12));
1266	result.append(Utils.padLeft("Unique", 12));
1267	result.append(Utils.padLeft("Dist", 6)).append('\n');
1268	for (int i = 0; i < numAttributes(); i++) {
1269	Attribute a = attribute(i);
1270	AttributeStats as = attributeStats(i);
1271	result.append(Utils.padLeft("" + (i + 1), 4)).append(' ');
1272	result.append(Utils.padRight(a.name(), 25)).append(' ');
1273	long percent;
1274	switch (a.type()) {
1275	case Attribute.NOMINAL:
1276	result.append(Utils.padLeft("Nom", 4)).append(' ');
1277	percent = Math.round(100.0 * as.intCount / as.totalCount);
1278	result.append(Utils.padLeft("" + percent, 3)).append("% ");
1279	result.append(Utils.padLeft("" + 0, 3)).append("% ");
1280	percent = Math.round(100.0 * as.realCount / as.totalCount);
1281	result.append(Utils.padLeft("" + percent, 3)).append("% ");
1282	break;
1283	case Attribute.NUMERIC:
1284	result.append(Utils.padLeft("Num", 4)).append(' ');
1285	result.append(Utils.padLeft("" + 0, 3)).append("% ");
1286	percent = Math.round(100.0 * as.intCount / as.totalCount);
1287	result.append(Utils.padLeft("" + percent, 3)).append("% ");
1288	percent = Math.round(100.0 * as.realCount / as.totalCount);
1289	result.append(Utils.padLeft("" + percent, 3)).append("% ");
1290	break;
1291	case Attribute.STRING:
1292	result.append(Utils.padLeft("Str", 4)).append(' ');
1293	percent = Math.round(100.0 * as.intCount / as.totalCount);
1294	result.append(Utils.padLeft("" + percent, 3)).append("% ");
1295	result.append(Utils.padLeft("" + 0, 3)).append("% ");
1296	percent = Math.round(100.0 * as.realCount / as.totalCount);
1297	result.append(Utils.padLeft("" + percent, 3)).append("% ");
1298	break;
1299	default:
1300	result.append(Utils.padLeft("???", 4)).append(' ');
1301	result.append(Utils.padLeft("" + 0, 3)).append("% ");
1302	percent = Math.round(100.0 * as.intCount / as.totalCount);
1303	result.append(Utils.padLeft("" + percent, 3)).append("% ");
1304	percent = Math.round(100.0 * as.realCount / as.totalCount);
1305	result.append(Utils.padLeft("" + percent, 3)).append("% ");
1306	break;
1307	}
1308	result.append(Utils.padLeft("" + as.missingCount, 5)).append(" /");
1309	percent = Math.round(100.0 * as.missingCount / as.totalCount);
1310	result.append(Utils.padLeft("" + percent, 3)).append("% ");
1311	result.append(Utils.padLeft("" + as.uniqueCount, 5)).append(" /");
1312	percent = Math.round(100.0 * as.uniqueCount / as.totalCount);
1313	result.append(Utils.padLeft("" + percent, 3)).append("% ");
1314	result.append(Utils.padLeft("" + as.distinctCount, 5)).append(' ');
1315	result.append('\n');
1316	}
1317	return result.toString();
1318	}
1319
1320	/**
1321	* Reads a single instance using the tokenizer and appends it
1322	* to the dataset. Automatically expands the dataset if it
1323	* is not large enough to hold the instance.
1324	*
1325	* @param tokenizer the tokenizer to be used
1326	* @param flag if method should test for carriage return after
1327	* each instance
1328	* @return false if end of file has been reached
1329	* @exception IOException if the information is not read
1330	* successfully
1331	*/
1332	protected boolean getInstance(StreamTokenizer tokenizer,
1333	boolean flag)
1334	throws IOException {
1335
1336	// Check if any attributes have been declared.
1337	if (m_Attributes.size() == 0) {
1338	errms(tokenizer,"no header information available");
1339	}
1340
1341	// Check if end of file reached.
1342	getFirstToken(tokenizer);
1343	if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
1344	return false;
1345	}
1346
1347	// Parse instance
1348	if (tokenizer.ttype == '{') {
1349	return getInstanceSparse(tokenizer, flag);
1350	} else {
1351	return getInstanceFull(tokenizer, flag);
1352	}
1353	}
1354
1355	/**
1356	* Reads a single instance using the tokenizer and appends it
1357	* to the dataset. Automatically expands the dataset if it
1358	* is not large enough to hold the instance.
1359	*
1360	* @param tokenizer the tokenizer to be used
1361	* @param flag if method should test for carriage return after
1362	* each instance
1363	* @return false if end of file has been reached
1364	* @exception IOException if the information is not read
1365	* successfully
1366	*/
1367	protected boolean getInstanceSparse(StreamTokenizer tokenizer,
1368	boolean flag)
1369	throws IOException {
1370
1371	int valIndex, numValues = 0, maxIndex = -1;
1372
1373	// Get values
1374	do {
1375
1376	// Get index
1377	getIndex(tokenizer);
1378	if (tokenizer.ttype == '}') {
1379	break;
1380	}
1381
1382	// Is index valid?
1383	try{
1384	m_IndicesBuffer[numValues] = Integer.valueOf(tokenizer.sval).intValue();
1385	} catch (NumberFormatException e) {
1386	errms(tokenizer,"index number expected");
1387	}
1388	if (m_IndicesBuffer[numValues] <= maxIndex) {
1389	errms(tokenizer,"indices have to be ordered");
1390	}
1391	if ((m_IndicesBuffer[numValues] < 0) \|\|
1392	(m_IndicesBuffer[numValues] >= numAttributes())) {
1393	errms(tokenizer,"index out of bounds");
1394	}
1395	maxIndex = m_IndicesBuffer[numValues];
1396
1397	// Get value;
1398	getNextToken(tokenizer);
1399
1400	// Check if value is missing.
1401	if (tokenizer.ttype == '?') {
1402	m_ValueBuffer[numValues] = Instance.missingValue();
1403	} else {
1404
1405	// Check if token is valid.
1406	if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
1407	errms(tokenizer,"not a valid value");
1408	}
1409	if (attribute(m_IndicesBuffer[numValues]).isNominal()) {
1410
1411	// Check if value appears in header.
1412	valIndex =
1413	attribute(m_IndicesBuffer[numValues]).indexOfValue(tokenizer.sval);
1414	if (valIndex == -1) {
1415	errms(tokenizer,"nominal value not declared in header");
1416	}
1417	m_ValueBuffer[numValues] = (double)valIndex;
1418	} else if (attribute(m_IndicesBuffer[numValues]).isNumeric()) {
1419
1420	// Check if value is really a number.
1421	try{
1422	m_ValueBuffer[numValues] = Double.valueOf(tokenizer.sval).
1423	doubleValue();
1424	} catch (NumberFormatException e) {
1425	errms(tokenizer,"number expected");
1426	}
1427	} else {
1428	m_ValueBuffer[numValues] =
1429	attribute(m_IndicesBuffer[numValues]).addStringValue(tokenizer.sval);
1430	}
1431	}
1432	numValues++;
1433	} while (true);
1434	if (flag) {
1435	getLastToken(tokenizer,true);
1436	}
1437
1438	// Add instance to dataset
1439	double[] tempValues = new double[numValues];
1440	int[] tempIndices = new int[numValues];
1441	System.arraycopy(m_ValueBuffer, 0, tempValues, 0, numValues);
1442	System.arraycopy(m_IndicesBuffer, 0, tempIndices, 0, numValues);
1443	add(new SparseInstance(1, tempValues, tempIndices, numAttributes()));
1444	return true;
1445	}
1446
1447	/**
1448	* Reads a single instance using the tokenizer and appends it
1449	* to the dataset. Automatically expands the dataset if it
1450	* is not large enough to hold the instance.
1451	*
1452	* @param tokenizer the tokenizer to be used
1453	* @param flag if method should test for carriage return after
1454	* each instance
1455	* @return false if end of file has been reached
1456	* @exception IOException if the information is not read
1457	* successfully
1458	*/
1459	protected boolean getInstanceFull(StreamTokenizer tokenizer,
1460	boolean flag)
1461	throws IOException {
1462
1463	double[] instance = new double[numAttributes()];
1464	int index;
1465
1466	// Get values for all attributes.
1467	for (int i = 0; i < numAttributes(); i++){
1468
1469	// Get next token
1470	if (i > 0) {
1471	getNextToken(tokenizer);
1472	}
1473
1474	// Check if value is missing.
1475	if (tokenizer.ttype == '?') {
1476	instance[i] = Instance.missingValue();
1477	} else {
1478
1479	// Check if token is valid.
1480	if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
1481	errms(tokenizer,"not a valid value");
1482	}
1483	if (attribute(i).isNominal()) {
1484
1485	// Check if value appears in header.
1486	index = attribute(i).indexOfValue(tokenizer.sval);
1487	if (index == -1) {
1488	errms(tokenizer,"nominal value not declared in header");
1489	}
1490	instance[i] = (double)index;
1491	} else if (attribute(i).isNumeric()) {
1492
1493	// Check if value is really a number.
1494	try{
1495	instance[i] = Double.valueOf(tokenizer.sval).
1496	doubleValue();
1497	} catch (NumberFormatException e) {
1498	errms(tokenizer,"number expected");
1499	}
1500	} else {
1501	instance[i] = attribute(i).addStringValue(tokenizer.sval);
1502	}
1503	}
1504	}
1505	if (flag) {
1506	getLastToken(tokenizer,true);
1507	}
1508
1509	// Add instance to dataset
1510	add(new Instance(1, instance));
1511	return true;
1512	}
1513
1514	/**
1515	* Reads and stores header of an ARFF file.
1516	*
1517	* @param tokenizer the stream tokenizer
1518	* @exception IOException if the information is not read
1519	* successfully
1520	*/
1521	protected void readHeader(StreamTokenizer tokenizer)
1522	throws IOException{
1523
1524	String attributeName;
1525	FastVector attributeValues;
1526	int i;
1527
1528	// Get name of relation.
1529	getFirstToken(tokenizer);
1530	if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
1531	errms(tokenizer,"premature end of file");
1532	}
1533	if (tokenizer.sval.equalsIgnoreCase("@relation")){
1534	getNextToken(tokenizer);
1535	m_RelationName = tokenizer.sval;
1536	getLastToken(tokenizer,false);
1537	} else {
1538	errms(tokenizer,"keyword @relation expected");
1539	}
1540
1541	// Create vectors to hold information temporarily.
1542	m_Attributes = new FastVector();
1543
1544	// Get attribute declarations.
1545	getFirstToken(tokenizer);
1546	if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
1547	errms(tokenizer,"premature end of file");
1548	}
1549	while (tokenizer.sval.equalsIgnoreCase("@attribute")) {
1550
1551	// Get attribute name.
1552	getNextToken(tokenizer);
1553	attributeName = tokenizer.sval;
1554	getNextToken(tokenizer);
1555
1556	// Check if attribute is nominal.
1557	if (tokenizer.ttype == StreamTokenizer.TT_WORD) {
1558
1559	// Attribute is real, integer, or string.
1560	if (tokenizer.sval.equalsIgnoreCase("real") \|\|
1561	tokenizer.sval.equalsIgnoreCase("integer") \|\|
1562	tokenizer.sval.equalsIgnoreCase("numeric")) {
1563	m_Attributes.addElement(new Attribute(attributeName,
1564	numAttributes()));
1565	readTillEOL(tokenizer);
1566	} else if (tokenizer.sval.equalsIgnoreCase("string")) {
1567	m_Attributes.
1568	addElement(new Attribute(attributeName, null,
1569	numAttributes()));
1570	readTillEOL(tokenizer);
1571	} else {
1572	errms(tokenizer,"no valid attribute type or invalid "+
1573	"enumeration");
1574	}
1575	} else {
1576
1577	// Attribute is nominal.
1578	attributeValues = new FastVector();
1579	tokenizer.pushBack();
1580
1581	// Get values for nominal attribute.
1582	if (tokenizer.nextToken() != '{') {
1583	errms(tokenizer,"{ expected at beginning of enumeration");
1584	}
1585	while (tokenizer.nextToken() != '}') {
1586	if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
1587	errms(tokenizer,"} expected at end of enumeration");
1588	} else {
1589	attributeValues.addElement(tokenizer.sval);
1590	}
1591	}
1592	if (attributeValues.size() == 0) {
1593	errms(tokenizer,"no nominal values found");
1594	}
1595	m_Attributes.
1596	addElement(new Attribute(attributeName, attributeValues,
1597	numAttributes()));
1598	}
1599	getLastToken(tokenizer,false);
1600	getFirstToken(tokenizer);
1601	if (tokenizer.ttype == StreamTokenizer.TT_EOF)
1602	errms(tokenizer,"premature end of file");
1603	}
1604
1605	// Check if data part follows. We can't easily check for EOL.
1606	if (!tokenizer.sval.equalsIgnoreCase("@data")) {
1607	errms(tokenizer,"keyword @data expected");
1608	}
1609
1610	// Check if any attributes have been declared.
1611	if (m_Attributes.size() == 0) {
1612	errms(tokenizer,"no attributes declared");
1613	}
1614
1615	// Allocate buffers in case sparse instances have to be read
1616	m_ValueBuffer = new double[numAttributes()];
1617	m_IndicesBuffer = new int[numAttributes()];
1618	}
1619
1620	/**
1621	* Copies instances from one set to the end of another
1622	* one.
1623	*
1624	* @param source the source of the instances
1625	* @param from the position of the first instance to be copied
1626	* @param dest the destination for the instances
1627	* @param num the number of instances to be copied
1628	*/
1629	private void copyInstances(int from, Instances dest, int num) {
1630
1631	for (int i = 0; i < num; i++) {
1632	dest.add(instance(from + i));
1633	}
1634	}
1635
1636	/**
1637	* Throws error message with line number and last token read.
1638	*
1639	* @param theMsg the error message to be thrown
1640	* @param tokenizer the stream tokenizer
1641	* @throws IOExcpetion containing the error message
1642	*/
1643	private void errms(StreamTokenizer tokenizer, String theMsg)
1644	throws IOException {
1645
1646	throw new IOException(theMsg + ", read " + tokenizer.toString());
1647	}
1648
1649	/**
1650	* Replaces the attribute information by a clone of
1651	* itself.
1652	*/
1653	private void freshAttributeInfo() {
1654
1655	m_Attributes = (FastVector) m_Attributes.copyElements();
1656	}
1657
1658	/**
1659	* Gets next token, skipping empty lines.
1660	*
1661	* @param tokenizer the stream tokenizer
1662	* @exception IOException if reading the next token fails
1663	*/
1664	private void getFirstToken(StreamTokenizer tokenizer)
1665	throws IOException{
1666
1667	while (tokenizer.nextToken() == StreamTokenizer.TT_EOL){};
1668	if ((tokenizer.ttype == '\'') \|\|
1669	(tokenizer.ttype == '"')) {
1670	tokenizer.ttype = StreamTokenizer.TT_WORD;
1671	} else if ((tokenizer.ttype == StreamTokenizer.TT_WORD) &&
1672	(tokenizer.sval.equals("?"))){
1673	tokenizer.ttype = '?';
1674	}
1675	}
1676
1677	/**
1678	* Gets index, checking for a premature and of line.
1679	*
1680	* @param tokenizer the stream tokenizer
1681	* @exception IOException if it finds a premature end of line
1682	*/
1683	private void getIndex(StreamTokenizer tokenizer) throws IOException{
1684
1685	if (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
1686	errms(tokenizer,"premature end of line");
1687	}
1688	if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
1689	errms(tokenizer,"premature end of file");
1690	}
1691	}
1692
1693	/**
1694	* Gets token and checks if its end of line.
1695	*
1696	* @param tokenizer the stream tokenizer
1697	* @exception IOException if it doesn't find an end of line
1698	*/
1699	private void getLastToken(StreamTokenizer tokenizer, boolean endOfFileOk)
1700	throws IOException{
1701
1702	if ((tokenizer.nextToken() != StreamTokenizer.TT_EOL) &&
1703	((tokenizer.nextToken() != StreamTokenizer.TT_EOF) \|\| !endOfFileOk)) {
1704	errms(tokenizer,"end of line expected");
1705	}
1706	}
1707
1708	/**
1709	* Gets next token, checking for a premature and of line.
1710	*
1711	* @param tokenizer the stream tokenizer
1712	* @exception IOException if it finds a premature end of line
1713	*/
1714	private void getNextToken(StreamTokenizer tokenizer)
1715	throws IOException{
1716
1717	if (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
1718	errms(tokenizer,"premature end of line");
1719	}
1720	if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
1721	errms(tokenizer,"premature end of file");
1722	} else if ((tokenizer.ttype == '\'') \|\|
1723	(tokenizer.ttype == '"')) {
1724	tokenizer.ttype = StreamTokenizer.TT_WORD;
1725	} else if ((tokenizer.ttype == StreamTokenizer.TT_WORD) &&
1726	(tokenizer.sval.equals("?"))){
1727	tokenizer.ttype = '?';
1728	}
1729	}
1730
1731	/**
1732	* Initializes the StreamTokenizer used for reading the ARFF file.
1733	*
1734	* @param tokenizer the stream tokenizer
1735	*/
1736	private void initTokenizer(StreamTokenizer tokenizer){
1737
1738	tokenizer.resetSyntax();
1739	tokenizer.whitespaceChars(0, ' ');
1740	tokenizer.wordChars(' '+1,'\u00FF');
1741	tokenizer.whitespaceChars(',',',');
1742	tokenizer.commentChar('%');
1743	tokenizer.quoteChar('"');
1744	tokenizer.quoteChar('\'');
1745	tokenizer.ordinaryChar('{');
1746	tokenizer.ordinaryChar('}');
1747	tokenizer.eolIsSignificant(true);
1748	}
1749
1750	/**
1751	* Returns string including all instances, their weights and
1752	* their indices in the original dataset.
1753	*
1754	* @return description of instance and its weight as a string
1755	*/
1756	private String instancesAndWeights(){
1757
1758	StringBuffer text = new StringBuffer();
1759
1760	for (int i = 0; i < numInstances(); i++) {
1761	text.append(instance(i) + " " + instance(i).weight());
1762	if (i < numInstances() - 1) {
1763	text.append("\n");
1764	}
1765	}
1766	return text.toString();
1767	}
1768
1769	/**
1770	* Implements quicksort.
1771	*
1772	* @param attIndex the attribute's index
1773	* @param lo0 the first index of the subset to be sorted
1774	* @param hi0 the last index of the subset to be sorted
1775	*/
1776	private void quickSort(int attIndex, int lo0, int hi0) {
1777
1778	int lo = lo0, hi = hi0;
1779	double mid, midPlus, midMinus;
1780
1781	if (hi0 > lo0) {
1782
1783	// Arbitrarily establishing partition element as the
1784	// midpoint of the array.
1785	mid = instance((lo0 + hi0) / 2).value(attIndex);
1786	midPlus = mid + 1e-6;
1787	midMinus = mid - 1e-6;
1788
1789	// loop through the array until indices cross
1790	while(lo <= hi) {
1791
1792	// find the first element that is greater than or equal to
1793	// the partition element starting from the left Index.
1794	while ((instance(lo).value(attIndex) <
1795	midMinus) && (lo < hi0)) {
1796	++lo;
1797	}
1798
1799	// find an element that is smaller than or equal to
1800	// the partition element starting from the right Index.
1801	while ((instance(hi).value(attIndex) >
1802	midPlus) && (hi > lo0)) {
1803	--hi;
1804	}
1805
1806	// if the indexes have not crossed, swap
1807	if(lo <= hi) {
1808	swap(lo,hi);
1809	++lo;
1810	--hi;
1811	}
1812	}
1813
1814	// If the right index has not reached the left side of array
1815	// must now sort the left partition.
1816	if(lo0 < hi) {
1817	quickSort(attIndex,lo0,hi);
1818	}
1819
1820	// If the left index has not reached the right side of array
1821	// must now sort the right partition.
1822	if(lo < hi0) {
1823	quickSort(attIndex,lo,hi0);
1824	}
1825	}
1826	}
1827
1828	/**
1829	* Reads and skips all tokens before next end of line token.
1830	*
1831	* @param tokenizer the stream tokenizer
1832	*/
1833	private void readTillEOL(StreamTokenizer tokenizer)
1834	throws IOException{
1835
1836	while (tokenizer.nextToken() != StreamTokenizer.TT_EOL) {};
1837	tokenizer.pushBack();
1838	}
1839
1840	/**
1841	* Help function needed for stratification of set.
1842	*
1843	* @param numFolds the number of folds for the stratification
1844	*/
1845	private void stratStep (int numFolds){
1846
1847	FastVector newVec = new FastVector(m_Instances.capacity());
1848	int start = 0, j;
1849
1850	// create stratified batch
1851	while (newVec.size() < numInstances()) {
1852	j = start;
1853	while (j < numInstances()) {
1854	newVec.addElement(instance(j));
1855	j = j + numFolds;
1856	}
1857	start++;
1858	}
1859	m_Instances = newVec;
1860	}
1861
1862	/**
1863	* Swaps two instances in the set.
1864	*
1865	* @param i the first instance's index
1866	* @param j the second instance's index
1867	*/
1868	private void swap(int i, int j){
1869
1870	m_Instances.swap(i, j);
1871	}
1872
1873	/**
1874	* Merges two sets of Instances together. The resulting set will have
1875	* all the attributes of the first set plus all the attributes of the
1876	* second set. The number of instances in both sets must be the same.
1877	*
1878	* @param first the first set of Instances
1879	* @param second the second set of Instances
1880	* @return the merged set of Instances
1881	* @exception IllegalArgumentException if the datasets are not the same size
1882	*/
1883	public static Instances mergeInstances(Instances first, Instances second) {
1884
1885	if (first.numInstances() != second.numInstances()) {
1886	throw new IllegalArgumentException("Instance sets must be of the same size");
1887	}
1888
1889	// Create the vector of merged attributes
1890	FastVector newAttributes = new FastVector();
1891	for (int i = 0; i < first.numAttributes(); i++) {
1892	newAttributes.addElement(first.attribute(i));
1893	}
1894	for (int i = 0; i < second.numAttributes(); i++) {
1895	newAttributes.addElement(second.attribute(i));
1896	}
1897
1898	// Create the set of Instances
1899	Instances merged = new Instances(first.relationName() + '_'
1900	+ second.relationName(),
1901	newAttributes,
1902	first.numInstances());
1903	// Merge each instance
1904	for (int i = 0; i < first.numInstances(); i++) {
1905	merged.add(first.instance(i).mergeInstance(second.instance(i)));
1906	}
1907	return merged;
1908	}
1909
1910	/**
1911	* Method for testing this class.
1912	*
1913	* @param argv should contain one element: the name of an ARFF file
1914	*/
1915	public static void test(String [] argv) {
1916
1917	Instances instances, secondInstances, train, test, transformed, empty;
1918	Instance instance;
1919	Random random = new Random(2);
1920	Reader reader;
1921	int start, num;
1922	double newWeight;
1923	FastVector testAtts, testVals;
1924	int i,j;
1925
1926	try{
1927	if (argv.length > 1) {
1928	throw (new Exception("Usage: Instances [<filename>]"));
1929	}
1930
1931	// Creating set of instances from scratch
1932	testVals = new FastVector(2);
1933	testVals.addElement("first_value");
1934	testVals.addElement("second_value");
1935	testAtts = new FastVector(2);
1936	testAtts.addElement(new Attribute("nominal_attribute", testVals));
1937	testAtts.addElement(new Attribute("numeric_attribute"));
1938	instances = new Instances("test_set", testAtts, 10);
1939	instances.add(new Instance(instances.numAttributes()));
1940	instances.add(new Instance(instances.numAttributes()));
1941	instances.add(new Instance(instances.numAttributes()));
1942	instances.setClassIndex(0);
1943	System.out.println("\nSet of instances created from scratch:\n");
1944	System.out.println(instances);
1945
1946	if (argv.length == 1) {
1947	String filename = argv[0];
1948	reader = new FileReader(filename);
1949
1950	// Read first five instances and print them
1951	System.out.println("\nFirst five instances from file:\n");
1952	instances = new Instances(reader, 1);
1953	instances.setClassIndex(instances.numAttributes() - 1);
1954	i = 0;
1955	while ((i < 5) && (instances.readInstance(reader))) {
1956	i++;
1957	}
1958	System.out.println(instances);
1959
1960	// Read all the instances in the file
1961	reader = new FileReader(filename);
1962	instances = new Instances(reader);
1963
1964	// Make the last attribute be the class
1965	instances.setClassIndex(instances.numAttributes() - 1);
1966
1967	// Print header and instances.
1968	System.out.println("\nDataset:\n");
1969	System.out.println(instances);
1970	System.out.println("\nClass index: "+instances.classIndex());
1971	}
1972
1973	// Test basic methods based on class index.
1974	System.out.println("\nClass name: "+instances.classAttribute().name());
1975	System.out.println("\nClass index: "+instances.classIndex());
1976	System.out.println("\nClass is nominal: " +
1977	instances.classAttribute().isNominal());
1978	System.out.println("\nClass is numeric: " +
1979	instances.classAttribute().isNumeric());
1980	System.out.println("\nClasses:\n");
1981	for (i = 0; i < instances.numClasses(); i++) {
1982	System.out.println(instances.classAttribute().value(i));
1983	}
1984	System.out.println("\nClass values and labels of instances:\n");
1985	for (i = 0; i < instances.numInstances(); i++) {
1986	Instance inst = instances.instance(i);
1987	System.out.print(inst.classValue() + "\t");
1988	System.out.print(inst.toString(inst.classIndex()));
1989	if (instances.instance(i).classIsMissing()) {
1990	System.out.println("\tis missing");
1991	} else {
1992	System.out.println();
1993	}
1994	}
1995
1996	// Create random weights.
1997	System.out.println("\nCreating random weights for instances.");
1998	for (i = 0; i < instances.numInstances(); i++) {
1999	instances.instance(i).setWeight(random.nextDouble());
2000	}
2001
2002	// Print all instances and their weights (and the sum of weights).
2003	System.out.println("\nInstances and their weights:\n");
2004	System.out.println(instances.instancesAndWeights());
2005	System.out.print("\nSum of weights: ");
2006	System.out.println(instances.sumOfWeights());
2007
2008	// Insert an attribute
2009	secondInstances = new Instances(instances);
2010	Attribute testAtt = new Attribute("Inserted");
2011	secondInstances.insertAttributeAt(testAtt, 0);
2012	System.out.println("\nSet with inserted attribute:\n");
2013	System.out.println(secondInstances);
2014	System.out.println("\nClass name: "
2015	+ secondInstances.classAttribute().name());
2016
2017	// Delete the attribute
2018	secondInstances.deleteAttributeAt(0);
2019	System.out.println("\nSet with attribute deleted:\n");
2020	System.out.println(secondInstances);
2021	System.out.println("\nClass name: "
2022	+ secondInstances.classAttribute().name());
2023
2024	// Test if headers are equal
2025	System.out.println("\nHeaders equal: "+
2026	instances.equalHeaders(secondInstances) + "\n");
2027
2028	// Print data in internal format.
2029	System.out.println("\nData (internal values):\n");
2030	for (i = 0; i < instances.numInstances(); i++) {
2031	for (j = 0; j < instances.numAttributes(); j++) {
2032	if (instances.instance(i).isMissing(j)) {
2033	System.out.print("? ");
2034	} else {
2035	System.out.print(instances.instance(i).value(j) + " ");
2036	}
2037	}
2038	System.out.println();
2039	}
2040
2041	// Just print header
2042	System.out.println("\nEmpty dataset:\n");
2043	empty = new Instances(instances, 0);
2044	System.out.println(empty);
2045	System.out.println("\nClass name: "+empty.classAttribute().name());
2046
2047	// Create copy and rename an attribute and a value (if possible)
2048	if (empty.classAttribute().isNominal()) {
2049	Instances copy = new Instances(empty, 0);
2050	copy.renameAttribute(copy.classAttribute(), "new_name");
2051	copy.renameAttributeValue(copy.classAttribute(),
2052	copy.classAttribute().value(0),
2053	"new_val_name");
2054	System.out.println("\nDataset with names changed:\n" + copy);
2055	System.out.println("\nOriginal dataset:\n" + empty);
2056	}
2057
2058	// Create and prints subset of instances.
2059	start = instances.numInstances() / 4;
2060	num = instances.numInstances() / 2;
2061	System.out.print("\nSubset of dataset: ");
2062	System.out.println(num + " instances from " + (start + 1)
2063	+ ". instance");
2064	secondInstances = new Instances(instances, start, num);
2065	System.out.println("\nClass name: "
2066	+ secondInstances.classAttribute().name());
2067
2068	// Print all instances and their weights (and the sum of weights).
2069	System.out.println("\nInstances and their weights:\n");
2070	System.out.println(secondInstances.instancesAndWeights());
2071	System.out.print("\nSum of weights: ");
2072	System.out.println(secondInstances.sumOfWeights());
2073
2074	// Create and print training and test sets for 3-fold
2075	// cross-validation.
2076	System.out.println("\nTrain and test folds for 3-fold CV:");
2077	if (instances.classAttribute().isNominal()) {
2078	instances.stratify(3);
2079	}
2080	for (j = 0; j < 3; j++) {
2081	train = instances.trainCV(3,j);
2082	test = instances.testCV(3,j);
2083
2084	// Print all instances and their weights (and the sum of weights).
2085	System.out.println("\nTrain: ");
2086	System.out.println("\nInstances and their weights:\n");
2087	System.out.println(train.instancesAndWeights());
2088	System.out.print("\nSum of weights: ");
2089	System.out.println(train.sumOfWeights());
2090	System.out.println("\nClass name: "+train.classAttribute().name());
2091	System.out.println("\nTest: ");
2092	System.out.println("\nInstances and their weights:\n");
2093	System.out.println(test.instancesAndWeights());
2094	System.out.print("\nSum of weights: ");
2095	System.out.println(test.sumOfWeights());
2096	System.out.println("\nClass name: "+test.classAttribute().name());
2097	}
2098
2099	// Randomize instances and print them.
2100	System.out.println("\nRandomized dataset:");
2101	instances.randomize(random);
2102
2103	// Print all instances and their weights (and the sum of weights).
2104	System.out.println("\nInstances and their weights:\n");
2105	System.out.println(instances.instancesAndWeights());
2106	System.out.print("\nSum of weights: ");
2107	System.out.println(instances.sumOfWeights());
2108
2109	// Sort instances according to first attribute and
2110	// print them.
2111	System.out.print("\nInstances sorted according to first attribute:\n ");
2112	instances.sort(0);
2113
2114	// Print all instances and their weights (and the sum of weights).
2115	System.out.println("\nInstances and their weights:\n");
2116	System.out.println(instances.instancesAndWeights());
2117	System.out.print("\nSum of weights: ");
2118	System.out.println(instances.sumOfWeights());
2119	} catch (Exception e) {
2120	e.printStackTrace();
2121	}
2122	}
2123
2124	/**
2125	* Main method for this class -- just prints a summary of a set
2126	* of instances.
2127	*
2128	* @param argv should contain one element: the name of an ARFF file
2129	*/
2130	public static void main(String [] args) {
2131
2132	try {
2133	Reader r = null;
2134	if (args.length > 1) {
2135	throw (new Exception("Usage: Instances <filename>"));
2136	} else if (args.length == 0) {
2137	r = new BufferedReader(new InputStreamReader(System.in));
2138	} else {
2139	r = new BufferedReader(new FileReader(args[0]));
2140	}
2141	Instances i = new Instances(r);
2142	System.out.println(i.toSummaryString());
2143	} catch (Exception ex) {
2144	System.err.println(ex.getMessage());
2145	}
2146	}
2147	}
2148
2149
2150

Note: See TracBrowser for help on using the repository browser.

Download in other formats: