1 | /*
|
---|
2 | * This program is free software; you can redistribute it and/or modify
|
---|
3 | * it under the terms of the GNU General Public License as published by
|
---|
4 | * the Free Software Foundation; either version 2 of the License, or
|
---|
5 | * (at your option) any later version.
|
---|
6 | *
|
---|
7 | * This program is distributed in the hope that it will be useful,
|
---|
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
10 | * GNU General Public License for more details.
|
---|
11 | *
|
---|
12 | * You should have received a copy of the GNU General Public License
|
---|
13 | * along with this program; if not, write to the Free Software
|
---|
14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
15 | */
|
---|
16 |
|
---|
17 | /*
|
---|
18 | * Instances.java
|
---|
19 | * Copyright (C) 1999 Eibe Frank
|
---|
20 | *
|
---|
21 | */
|
---|
22 |
|
---|
23 | package weka.core;
|
---|
24 |
|
---|
25 | import java.io.*;
|
---|
26 | import java.util.*;
|
---|
27 |
|
---|
28 | /**
|
---|
29 | * Class for handling an ordered set of weighted instances. <p>
|
---|
30 | *
|
---|
31 | * Typical usage (code from the main() method of this class): <p>
|
---|
32 | *
|
---|
33 | * <code>
|
---|
34 | * ... <br>
|
---|
35 | *
|
---|
36 | * // Read all the instances in the file <br>
|
---|
37 | * reader = new FileReader(filename); <br>
|
---|
38 | * instances = new Instances(reader); <br><br>
|
---|
39 | *
|
---|
40 | * // Make the last attribute be the class <br>
|
---|
41 | * instances.setClassIndex(instances.numAttributes() - 1); <br><br>
|
---|
42 | *
|
---|
43 | * // Print header and instances. <br>
|
---|
44 | * System.out.println("\nDataset:\n"); <br>
|
---|
45 | * System.out.println(instances); <br><br>
|
---|
46 | *
|
---|
47 | * ... <br>
|
---|
48 | * </code><p>
|
---|
49 | *
|
---|
50 | * All methods that change a set of instances are safe, i.e., a change
|
---|
51 | * of a set of instances does not affect any other sets of
|
---|
52 | * instances. All methods that change a dataset's attribute
|
---|
53 | * information clone the dataset before it is changed.
|
---|
54 | *
|
---|
55 | * @author Eibe Frank ([email protected])
|
---|
56 | * @author Len Trigg ([email protected])
|
---|
57 | * @version $Revision: 8815 $
|
---|
58 | */
|
---|
59 | public class Instances implements Serializable {
|
---|
60 |
|
---|
61 | /** The filename extension that should be used for arff files */
|
---|
62 | public static String FILE_EXTENSION = ".arff";
|
---|
63 |
|
---|
64 | /** The dataset's name. */
|
---|
65 | protected String m_RelationName;
|
---|
66 |
|
---|
67 | /** The attribute information. */
|
---|
68 | protected FastVector m_Attributes;
|
---|
69 |
|
---|
70 | /** The instances. */
|
---|
71 | protected FastVector m_Instances;
|
---|
72 |
|
---|
73 | /** The class attribute's index */
|
---|
74 | protected int m_ClassIndex;
|
---|
75 |
|
---|
76 | /** Buffer of values for sparse instance */
|
---|
77 | protected double[] m_ValueBuffer;
|
---|
78 |
|
---|
79 | /** Buffer of indices for sparse instance */
|
---|
80 | protected int[] m_IndicesBuffer;
|
---|
81 |
|
---|
82 | /**
|
---|
83 | * Reads an ARFF file from a reader, and assigns a weight of
|
---|
84 | * one to each instance. Lets the index of the class
|
---|
85 | * attribute be undefined (negative).
|
---|
86 | *
|
---|
87 | * @param reader the reader
|
---|
88 | * @exception IOException if the ARFF file is not read
|
---|
89 | * successfully
|
---|
90 | */
|
---|
91 | public Instances(Reader reader) throws IOException {
|
---|
92 |
|
---|
93 | StreamTokenizer tokenizer;
|
---|
94 |
|
---|
95 | tokenizer = new StreamTokenizer(reader);
|
---|
96 | initTokenizer(tokenizer);
|
---|
97 | readHeader(tokenizer);
|
---|
98 | m_ClassIndex = -1;
|
---|
99 | m_Instances = new FastVector(1000);
|
---|
100 | while (getInstance(tokenizer, true)) {};
|
---|
101 | compactify();
|
---|
102 | }
|
---|
103 |
|
---|
/**
 * Reads only the header of an ARFF file from a reader and prepares
 * an empty instance store with the requested capacity. The class
 * index is left undefined (negative). No instances are read.
 *
 * @param reader the reader supplying the ARFF header
 * @param capacity the initial capacity of the instance store
 * @exception IllegalArgumentException if the header is not read successfully
 * or the capacity is negative.
 * @exception IOException if there is a problem with the reader.
 */
public Instances(Reader reader, int capacity) throws IOException {

  // Reject a negative capacity before touching the reader.
  if (capacity < 0) {
    throw new IllegalArgumentException("Capacity has to be positive!");
  }

  StreamTokenizer headerTokens = new StreamTokenizer(reader);
  initTokenizer(headerTokens);
  readHeader(headerTokens);

  m_ClassIndex = -1;
  m_Instances = new FastVector(capacity);
}
|
---|
128 |
|
---|
129 | /**
|
---|
130 | * Constructor copying all instances and references to
|
---|
131 | * the header information from the given set of instances.
|
---|
132 | *
|
---|
133 | * @param instances the set to be copied
|
---|
134 | */
|
---|
135 | public Instances(Instances dataset) {
|
---|
136 |
|
---|
137 | this(dataset, dataset.numInstances());
|
---|
138 |
|
---|
139 | dataset.copyInstances(0, this, dataset.numInstances());
|
---|
140 | }
|
---|
141 |
|
---|
142 | /**
|
---|
143 | * Constructor creating an empty set of instances. Copies references
|
---|
144 | * to the header information from the given set of instances. Sets
|
---|
145 | * the capacity of the set of instances to 0 if its negative.
|
---|
146 | *
|
---|
147 | * @param instances the instances from which the header
|
---|
148 | * information is to be taken
|
---|
149 | * @param capacity the capacity of the new dataset
|
---|
150 | */
|
---|
151 | public Instances(Instances dataset, int capacity) {
|
---|
152 |
|
---|
153 | if (capacity < 0) {
|
---|
154 | capacity = 0;
|
---|
155 | }
|
---|
156 |
|
---|
157 | // Strings only have to be "shallow" copied because
|
---|
158 | // they can't be modified.
|
---|
159 | m_ClassIndex = dataset.m_ClassIndex;
|
---|
160 | m_RelationName = dataset.m_RelationName;
|
---|
161 | m_Attributes = dataset.m_Attributes;
|
---|
162 | m_Instances = new FastVector(capacity);
|
---|
163 | }
|
---|
164 |
|
---|
165 | /**
|
---|
166 | * Creates a new set of instances by copying a
|
---|
167 | * subset of another set.
|
---|
168 | *
|
---|
169 | * @param source the set of instances from which a subset
|
---|
170 | * is to be created
|
---|
171 | * @param first the index of the first instance to be copied
|
---|
172 | * @param toCopy the number of instances to be copied
|
---|
173 | * @exception IllegalArgumentException if first and toCopy are out of range
|
---|
174 | */
|
---|
175 | public Instances(Instances source, int first, int toCopy) {
|
---|
176 |
|
---|
177 | this(source, toCopy);
|
---|
178 |
|
---|
179 | if ((first < 0) || ((first + toCopy) > source.numInstances())) {
|
---|
180 | throw new IllegalArgumentException("Parameters first and/or toCopy out "+
|
---|
181 | "of range");
|
---|
182 | }
|
---|
183 | source.copyInstances(first, this, toCopy);
|
---|
184 | }
|
---|
185 |
|
---|
/**
 * Creates an empty set of instances that uses the given attribute
 * information. Each attribute's index is set to its position in
 * attInfo. A negative capacity is treated as 0. The given attribute
 * information must not be changed after this constructor has been
 * used, because it is stored by reference.
 *
 * @param name the name of the relation
 * @param attInfo the attribute information
 * @param capacity the capacity of the set
 */
public Instances(String name, FastVector attInfo, int capacity) {

  m_RelationName = name;
  m_ClassIndex = -1;
  m_Attributes = attInfo;
  for (int i = 0; i < numAttributes(); i++) {
    attribute(i).setIndex(i);
  }
  // Honour the documented contract (and stay consistent with the
  // header-copying constructor): never pass a negative capacity on.
  if (capacity < 0) {
    capacity = 0;
  }
  m_Instances = new FastVector(capacity);
}
|
---|
206 |
|
---|
207 | /**
|
---|
208 | * Create a copy of the structure, but "cleanse" string types (i.e.
|
---|
209 | * doesn't contain references to the strings seen in the past).
|
---|
210 | *
|
---|
211 | * @return a copy of the instance structure.
|
---|
212 | */
|
---|
213 | public Instances stringFreeStructure() {
|
---|
214 |
|
---|
215 | FastVector atts = (FastVector)m_Attributes.copy();
|
---|
216 | for (int i = 0 ; i < atts.size(); i++) {
|
---|
217 | Attribute att = (Attribute)atts.elementAt(i);
|
---|
218 | if (att.type() == Attribute.STRING) {
|
---|
219 | atts.setElementAt(new Attribute(att.name(), null), i);
|
---|
220 | }
|
---|
221 | }
|
---|
222 | Instances result = new Instances(relationName(), atts, 0);
|
---|
223 | result.m_ClassIndex = m_ClassIndex;
|
---|
224 | return result;
|
---|
225 | }
|
---|
226 |
|
---|
227 | /**
|
---|
228 | * Adds one instance to the end of the set.
|
---|
229 | * Shallow copies instance before it is added. Increases the
|
---|
230 | * size of the dataset if it is not large enough. Does not
|
---|
231 | * check if the instance is compatible with the dataset.
|
---|
232 | *
|
---|
233 | * @param instance the instance to be added
|
---|
234 | */
|
---|
235 | public final void add(Instance instance) {
|
---|
236 |
|
---|
237 | Instance newInstance = (Instance)instance.copy();
|
---|
238 |
|
---|
239 | newInstance.setDataset(this);
|
---|
240 | m_Instances.addElement(newInstance);
|
---|
241 | }
|
---|
242 |
|
---|
243 | /**
|
---|
244 | * Returns an attribute.
|
---|
245 | *
|
---|
246 | * @param index the attribute's index
|
---|
247 | * @return the attribute at the given position
|
---|
248 | */
|
---|
249 | public final Attribute attribute(int index) {
|
---|
250 |
|
---|
251 | return (Attribute) m_Attributes.elementAt(index);
|
---|
252 | }
|
---|
253 |
|
---|
254 | /**
|
---|
255 | * Returns an attribute given its name. If there is more than
|
---|
256 | * one attribute with the same name, it returns the first one.
|
---|
257 | * Returns null if the attribute can't be found.
|
---|
258 | *
|
---|
259 | * @param name the attribute's name
|
---|
260 | * @return the attribute with the given name, null if the
|
---|
261 | * attribute can't be found
|
---|
262 | */
|
---|
263 | public final Attribute attribute(String name) {
|
---|
264 |
|
---|
265 | for (int i = 0; i < numAttributes(); i++) {
|
---|
266 | if (attribute(i).name().equals(name)) {
|
---|
267 | return attribute(i);
|
---|
268 | }
|
---|
269 | }
|
---|
270 | return null;
|
---|
271 | }
|
---|
272 |
|
---|
273 | /**
|
---|
274 | * Checks for string attributes in the dataset
|
---|
275 | *
|
---|
276 | * @return true if string attributes are present, false otherwise
|
---|
277 | */
|
---|
278 | public boolean checkForStringAttributes() {
|
---|
279 |
|
---|
280 | int i = 0;
|
---|
281 |
|
---|
282 | while (i < m_Attributes.size()) {
|
---|
283 | if (attribute(i++).isString()) {
|
---|
284 | return true;
|
---|
285 | }
|
---|
286 | }
|
---|
287 | return false;
|
---|
288 | }
|
---|
289 |
|
---|
290 | /**
|
---|
291 | * Checks if the given instance is compatible
|
---|
292 | * with this dataset. Only looks at the size of
|
---|
293 | * the instance and the ranges of the values for
|
---|
294 | * nominal and string attributes.
|
---|
295 | *
|
---|
296 | * @return true if the instance is compatible with the dataset
|
---|
297 | */
|
---|
298 | public final boolean checkInstance(Instance instance) {
|
---|
299 |
|
---|
300 | if (instance.numAttributes() != numAttributes()) {
|
---|
301 | return false;
|
---|
302 | }
|
---|
303 | for (int i = 0; i < numAttributes(); i++) {
|
---|
304 | if (instance.isMissing(i)) {
|
---|
305 | continue;
|
---|
306 | } else if (attribute(i).isNominal() ||
|
---|
307 | attribute(i).isString()) {
|
---|
308 | if (!(Utils.eq(instance.value(i),
|
---|
309 | (double)(int)instance.value(i)))) {
|
---|
310 | return false;
|
---|
311 | } else if (Utils.sm(instance.value(i), 0) ||
|
---|
312 | Utils.gr(instance.value(i),
|
---|
313 | attribute(i).numValues())) {
|
---|
314 | return false;
|
---|
315 | }
|
---|
316 | }
|
---|
317 | }
|
---|
318 | return true;
|
---|
319 | }
|
---|
320 |
|
---|
321 | /**
|
---|
322 | * Returns the class attribute.
|
---|
323 | *
|
---|
324 | * @return the class attribute
|
---|
325 | * @exception UnassignedClassException if the class is not set
|
---|
326 | */
|
---|
327 | public final Attribute classAttribute() {
|
---|
328 |
|
---|
329 | if (m_ClassIndex < 0) {
|
---|
330 | throw new UnassignedClassException("Class index is negative (not set)!");
|
---|
331 | }
|
---|
332 | return attribute(m_ClassIndex);
|
---|
333 | }
|
---|
334 |
|
---|
335 | /**
|
---|
336 | * Returns the class attribute's index. Returns negative number
|
---|
337 | * if it's undefined.
|
---|
338 | *
|
---|
339 | * @return the class index as an integer
|
---|
340 | */
|
---|
341 | public final int classIndex() {
|
---|
342 |
|
---|
343 | return m_ClassIndex;
|
---|
344 | }
|
---|
345 |
|
---|
346 | /**
|
---|
347 | * Compactifies the set of instances. Decreases the capacity of
|
---|
348 | * the set so that it matches the number of instances in the set.
|
---|
349 | */
|
---|
350 | public final void compactify() {
|
---|
351 |
|
---|
352 | m_Instances.trimToSize();
|
---|
353 | }
|
---|
354 |
|
---|
355 | /**
|
---|
356 | * Removes all instances from the set.
|
---|
357 | */
|
---|
358 | public final void delete() {
|
---|
359 |
|
---|
360 | m_Instances = new FastVector();
|
---|
361 | }
|
---|
362 |
|
---|
363 | /**
|
---|
364 | * Removes an instance at the given position from the set.
|
---|
365 | *
|
---|
366 | * @param index the instance's position
|
---|
367 | */
|
---|
368 | public final void delete(int index) {
|
---|
369 |
|
---|
370 | m_Instances.removeElementAt(index);
|
---|
371 | }
|
---|
372 |
|
---|
373 | /**
|
---|
374 | * Deletes an attribute at the given position
|
---|
375 | * (0 to numAttributes() - 1). A deep copy of the attribute
|
---|
376 | * information is performed before the attribute is deleted.
|
---|
377 | *
|
---|
378 | * @param pos the attribute's position
|
---|
379 | * @exception IllegalArgumentException if the given index is out of range or the
|
---|
380 | * class attribute is being deleted
|
---|
381 | */
|
---|
382 | public void deleteAttributeAt(int position) {
|
---|
383 |
|
---|
384 | if ((position < 0) || (position >= m_Attributes.size())) {
|
---|
385 | throw new IllegalArgumentException("Index out of range");
|
---|
386 | }
|
---|
387 | if (position == m_ClassIndex) {
|
---|
388 | throw new IllegalArgumentException("Can't delete class attribute");
|
---|
389 | }
|
---|
390 | freshAttributeInfo();
|
---|
391 | if (m_ClassIndex > position) {
|
---|
392 | m_ClassIndex--;
|
---|
393 | }
|
---|
394 | m_Attributes.removeElementAt(position);
|
---|
395 | for (int i = position; i < m_Attributes.size(); i++) {
|
---|
396 | Attribute current = (Attribute)m_Attributes.elementAt(i);
|
---|
397 | current.setIndex(current.index() - 1);
|
---|
398 | }
|
---|
399 | for (int i = 0; i < numInstances(); i++) {
|
---|
400 | instance(i).forceDeleteAttributeAt(position);
|
---|
401 | }
|
---|
402 | }
|
---|
403 |
|
---|
404 | /**
|
---|
405 | * Deletes all string attributes in the dataset. A deep copy of the attribute
|
---|
406 | * information is performed before an attribute is deleted.
|
---|
407 | *
|
---|
408 | * @exception IllegalArgumentException if string attribute couldn't be
|
---|
409 | * successfully deleted (probably because it is the class attribute).
|
---|
410 | */
|
---|
411 | public void deleteStringAttributes() {
|
---|
412 |
|
---|
413 | int i = 0;
|
---|
414 | while (i < m_Attributes.size()) {
|
---|
415 | if (attribute(i).isString()) {
|
---|
416 | deleteAttributeAt(i);
|
---|
417 | } else {
|
---|
418 | i++;
|
---|
419 | }
|
---|
420 | }
|
---|
421 | }
|
---|
422 |
|
---|
423 | /**
|
---|
424 | * Removes all instances with missing values for a particular
|
---|
425 | * attribute from the dataset.
|
---|
426 | *
|
---|
427 | * @param attIndex the attribute's index
|
---|
428 | */
|
---|
429 | public final void deleteWithMissing(int attIndex) {
|
---|
430 |
|
---|
431 | FastVector newInstances = new FastVector(numInstances());
|
---|
432 |
|
---|
433 | for (int i = 0; i < numInstances(); i++) {
|
---|
434 | if (!instance(i).isMissing(attIndex)) {
|
---|
435 | newInstances.addElement(instance(i));
|
---|
436 | }
|
---|
437 | }
|
---|
438 | m_Instances = newInstances;
|
---|
439 | }
|
---|
440 |
|
---|
441 | /**
|
---|
442 | * Removes all instances with missing values for a particular
|
---|
443 | * attribute from the dataset.
|
---|
444 | *
|
---|
445 | * @param att the attribute
|
---|
446 | */
|
---|
447 | public final void deleteWithMissing(Attribute att) {
|
---|
448 |
|
---|
449 | deleteWithMissing(att.index());
|
---|
450 | }
|
---|
451 |
|
---|
452 | /**
|
---|
453 | * Removes all instances with a missing class value
|
---|
454 | * from the dataset.
|
---|
455 | *
|
---|
456 | * @exception UnassignedClassException if class is not set
|
---|
457 | */
|
---|
458 | public final void deleteWithMissingClass() {
|
---|
459 |
|
---|
460 | if (m_ClassIndex < 0) {
|
---|
461 | throw new UnassignedClassException("Class index is negative (not set)!");
|
---|
462 | }
|
---|
463 | deleteWithMissing(m_ClassIndex);
|
---|
464 | }
|
---|
465 |
|
---|
466 | /**
|
---|
467 | * Returns an enumeration of all the attributes.
|
---|
468 | *
|
---|
469 | * @return enumeration of all the attributes.
|
---|
470 | */
|
---|
471 | public Enumeration enumerateAttributes() {
|
---|
472 |
|
---|
473 | return m_Attributes.elements(m_ClassIndex);
|
---|
474 | }
|
---|
475 |
|
---|
476 | /**
|
---|
477 | * Returns an enumeration of all instances in the dataset.
|
---|
478 | *
|
---|
479 | * @return enumeration of all instances in the dataset
|
---|
480 | */
|
---|
481 | public final Enumeration enumerateInstances() {
|
---|
482 |
|
---|
483 | return m_Instances.elements();
|
---|
484 | }
|
---|
485 |
|
---|
486 | /**
|
---|
487 | * Checks if two headers are equivalent.
|
---|
488 | *
|
---|
489 | * @param dataset another dataset
|
---|
490 | * @return true if the header of the given dataset is equivalent
|
---|
491 | * to this header
|
---|
492 | */
|
---|
493 | public final boolean equalHeaders(Instances dataset){
|
---|
494 |
|
---|
495 | // Check class and all attributes
|
---|
496 | if (m_ClassIndex != dataset.m_ClassIndex) {
|
---|
497 | return false;
|
---|
498 | }
|
---|
499 | if (m_Attributes.size() != dataset.m_Attributes.size()) {
|
---|
500 | return false;
|
---|
501 | }
|
---|
502 | for (int i = 0; i < m_Attributes.size(); i++) {
|
---|
503 | if (!(attribute(i).equals(dataset.attribute(i)))) {
|
---|
504 | return false;
|
---|
505 | }
|
---|
506 | }
|
---|
507 | return true;
|
---|
508 | }
|
---|
509 |
|
---|
510 | /**
|
---|
511 | * Returns the first instance in the set.
|
---|
512 | *
|
---|
513 | * @return the first instance in the set
|
---|
514 | */
|
---|
515 | public final Instance firstInstance() {
|
---|
516 |
|
---|
517 | return (Instance)m_Instances.firstElement();
|
---|
518 | }
|
---|
519 |
|
---|
520 | /**
|
---|
521 | * Inserts an attribute at the given position (0 to
|
---|
522 | * numAttributes()) and sets all values to be missing.
|
---|
523 | * Shallow copies the attribute before it is inserted, and performs
|
---|
524 | * a deep copy of the existing attribute information.
|
---|
525 | *
|
---|
526 | * @param att the attribute to be inserted
|
---|
527 | * @param pos the attribute's position
|
---|
528 | * @exception IllegalArgumentException if the given index is out of range
|
---|
529 | */
|
---|
530 | public void insertAttributeAt(Attribute att, int position) {
|
---|
531 |
|
---|
532 | if ((position < 0) ||
|
---|
533 | (position > m_Attributes.size())) {
|
---|
534 | throw new IllegalArgumentException("Index out of range");
|
---|
535 | }
|
---|
536 | att = (Attribute)att.copy();
|
---|
537 | freshAttributeInfo();
|
---|
538 | att.setIndex(position);
|
---|
539 | m_Attributes.insertElementAt(att, position);
|
---|
540 | for (int i = position + 1; i < m_Attributes.size(); i++) {
|
---|
541 | Attribute current = (Attribute)m_Attributes.elementAt(i);
|
---|
542 | current.setIndex(current.index() + 1);
|
---|
543 | }
|
---|
544 | for (int i = 0; i < numInstances(); i++) {
|
---|
545 | instance(i).forceInsertAttributeAt(position);
|
---|
546 | }
|
---|
547 | if (m_ClassIndex >= position) {
|
---|
548 | m_ClassIndex++;
|
---|
549 | }
|
---|
550 | }
|
---|
551 |
|
---|
552 | /**
|
---|
553 | * Returns the instance at the given position.
|
---|
554 | *
|
---|
555 | * @param index the instance's index
|
---|
556 | * @return the instance at the given position
|
---|
557 | */
|
---|
558 | public final Instance instance(int index) {
|
---|
559 |
|
---|
560 | return (Instance)m_Instances.elementAt(index);
|
---|
561 | }
|
---|
562 |
|
---|
563 | /**
|
---|
564 | * Returns the last instance in the set.
|
---|
565 | *
|
---|
566 | * @return the last instance in the set
|
---|
567 | */
|
---|
568 | public final Instance lastInstance() {
|
---|
569 |
|
---|
570 | return (Instance)m_Instances.lastElement();
|
---|
571 | }
|
---|
572 |
|
---|
573 | /**
|
---|
574 | * Returns the mean (mode) for a numeric (nominal) attribute as
|
---|
575 | * a floating-point value. Returns 0 if the attribute is neither nominal nor
|
---|
576 | * numeric. If all values are missing it returns zero.
|
---|
577 | *
|
---|
578 | * @param attIndex the attribute's index
|
---|
579 | * @return the mean or the mode
|
---|
580 | */
|
---|
581 | public final double meanOrMode(int attIndex) {
|
---|
582 |
|
---|
583 | double result, found;
|
---|
584 | int [] counts;
|
---|
585 |
|
---|
586 | if (attribute(attIndex).isNumeric()) {
|
---|
587 | result = found = 0;
|
---|
588 | for (int j = 0; j < numInstances(); j++) {
|
---|
589 | if (!instance(j).isMissing(attIndex)) {
|
---|
590 | found += instance(j).weight();
|
---|
591 | result += instance(j).weight()*instance(j).value(attIndex);
|
---|
592 | }
|
---|
593 | }
|
---|
594 | if (Utils.eq(found, 0)) {
|
---|
595 | return 0;
|
---|
596 | } else {
|
---|
597 | return result / found;
|
---|
598 | }
|
---|
599 | } else if (attribute(attIndex).isNominal()) {
|
---|
600 | counts = new int[attribute(attIndex).numValues()];
|
---|
601 | for (int j = 0; j < numInstances(); j++) {
|
---|
602 | if (!instance(j).isMissing(attIndex)) {
|
---|
603 | counts[(int) instance(j).value(attIndex)] += instance(j).weight();
|
---|
604 | }
|
---|
605 | }
|
---|
606 | return (double)Utils.maxIndex(counts);
|
---|
607 | } else {
|
---|
608 | return 0;
|
---|
609 | }
|
---|
610 | }
|
---|
611 |
|
---|
612 | /**
|
---|
613 | * Returns the mean (mode) for a numeric (nominal) attribute as a
|
---|
614 | * floating-point value. Returns 0 if the attribute is neither
|
---|
615 | * nominal nor numeric. If all values are missing it returns zero.
|
---|
616 | *
|
---|
617 | * @param att the attribute
|
---|
618 | * @return the mean or the mode
|
---|
619 | */
|
---|
620 | public final double meanOrMode(Attribute att) {
|
---|
621 |
|
---|
622 | return meanOrMode(att.index());
|
---|
623 | }
|
---|
624 |
|
---|
625 | /**
|
---|
626 | * Returns the number of attributes.
|
---|
627 | *
|
---|
628 | * @return the number of attributes as an integer
|
---|
629 | */
|
---|
630 | public final int numAttributes() {
|
---|
631 |
|
---|
632 | return m_Attributes.size();
|
---|
633 | }
|
---|
634 |
|
---|
635 | /**
|
---|
636 | * Returns the number of class labels.
|
---|
637 | *
|
---|
638 | * @return the number of class labels as an integer if the class
|
---|
639 | * attribute is nominal, 1 otherwise.
|
---|
640 | * @exception UnassignedClassException if the class is not set
|
---|
641 | */
|
---|
642 | public final int numClasses() {
|
---|
643 |
|
---|
644 | if (m_ClassIndex < 0) {
|
---|
645 | throw new UnassignedClassException("Class index is negative (not set)!");
|
---|
646 | }
|
---|
647 | if (!classAttribute().isNominal()) {
|
---|
648 | return 1;
|
---|
649 | } else {
|
---|
650 | return classAttribute().numValues();
|
---|
651 | }
|
---|
652 | }
|
---|
653 |
|
---|
654 | /**
|
---|
655 | * Returns the number of distinct values of a given attribute.
|
---|
656 | * Returns the number of instances if the attribute is a
|
---|
657 | * string attribute. The value 'missing' is not counted.
|
---|
658 | *
|
---|
659 | * @param attIndex the attribute
|
---|
660 | * @return the number of distinct values of a given attribute
|
---|
661 | */
|
---|
662 | public final int numDistinctValues(int attIndex) {
|
---|
663 |
|
---|
664 | if (attribute(attIndex).isNumeric()) {
|
---|
665 | double [] attVals = attributeToDoubleArray(attIndex);
|
---|
666 | int [] sorted = Utils.sort(attVals);
|
---|
667 | double prev = 0;
|
---|
668 | int counter = 0;
|
---|
669 | for (int i = 0; i < sorted.length; i++) {
|
---|
670 | Instance current = instance(sorted[i]);
|
---|
671 | if (current.isMissing(attIndex)) {
|
---|
672 | break;
|
---|
673 | }
|
---|
674 | if ((i == 0) ||
|
---|
675 | Utils.gr(current.value(attIndex), prev)) {
|
---|
676 | prev = current.value(attIndex);
|
---|
677 | counter++;
|
---|
678 | }
|
---|
679 | }
|
---|
680 | return counter;
|
---|
681 | } else {
|
---|
682 | return attribute(attIndex).numValues();
|
---|
683 | }
|
---|
684 | }
|
---|
685 |
|
---|
686 | /**
|
---|
687 | * Returns the number of distinct values of a given attribute.
|
---|
688 | * Returns the number of instances if the attribute is a
|
---|
689 | * string attribute. The value 'missing' is not counted.
|
---|
690 | *
|
---|
691 | * @param att the attribute
|
---|
692 | * @return the number of distinct values of a given attribute
|
---|
693 | */
|
---|
694 | public final int numDistinctValues(Attribute att) {
|
---|
695 |
|
---|
696 | return numDistinctValues(att.index());
|
---|
697 | }
|
---|
698 |
|
---|
699 | /**
|
---|
700 | * Returns the number of instances in the dataset.
|
---|
701 | *
|
---|
702 | * @return the number of instances in the dataset as an integer
|
---|
703 | */
|
---|
704 | public final int numInstances() {
|
---|
705 |
|
---|
706 | return m_Instances.size();
|
---|
707 | }
|
---|
708 |
|
---|
709 | /**
|
---|
710 | * Shuffles the instances in the set so that they are ordered
|
---|
711 | * randomly.
|
---|
712 | *
|
---|
713 | * @param random a random number generator
|
---|
714 | */
|
---|
715 | public final void randomize(Random random) {
|
---|
716 |
|
---|
717 | for (int j = numInstances() - 1; j > 0; j--)
|
---|
718 | swap(j,(int)(random.nextDouble()*(double)j));
|
---|
719 | }
|
---|
720 |
|
---|
721 | /**
|
---|
722 | * Reads a single instance from the reader and appends it
|
---|
723 | * to the dataset. Automatically expands the dataset if it
|
---|
724 | * is not large enough to hold the instance. This method does
|
---|
725 | * not check for carriage return at the end of the line.
|
---|
726 | *
|
---|
727 | * @param reader the reader
|
---|
728 | * @return false if end of file has been reached
|
---|
729 | * @exception IOException if the information is not read
|
---|
730 | * successfully
|
---|
731 | */
|
---|
732 | public final boolean readInstance(Reader reader)
|
---|
733 | throws IOException {
|
---|
734 |
|
---|
735 | StreamTokenizer tokenizer = new StreamTokenizer(reader);
|
---|
736 |
|
---|
737 | initTokenizer(tokenizer);
|
---|
738 | return getInstance(tokenizer, false);
|
---|
739 | }
|
---|
740 |
|
---|
741 | /**
|
---|
742 | * Returns the relation's name.
|
---|
743 | *
|
---|
744 | * @return the relation's name as a string
|
---|
745 | */
|
---|
746 | public final String relationName() {
|
---|
747 |
|
---|
748 | return m_RelationName;
|
---|
749 | }
|
---|
750 |
|
---|
751 | /**
|
---|
752 | * Renames an attribute. This change only affects this
|
---|
753 | * dataset.
|
---|
754 | *
|
---|
755 | * @param att the attribute's index
|
---|
756 | * @param name the new name
|
---|
757 | */
|
---|
758 | public final void renameAttribute(int att, String name) {
|
---|
759 |
|
---|
760 | Attribute newAtt = attribute(att).copy(name);
|
---|
761 | FastVector newVec = new FastVector(numAttributes());
|
---|
762 |
|
---|
763 | for (int i = 0; i < numAttributes(); i++) {
|
---|
764 | if (i == att) {
|
---|
765 | newVec.addElement(newAtt);
|
---|
766 | } else {
|
---|
767 | newVec.addElement(attribute(i));
|
---|
768 | }
|
---|
769 | }
|
---|
770 | m_Attributes = newVec;
|
---|
771 | }
|
---|
772 |
|
---|
773 | /**
|
---|
774 | * Renames an attribute. This change only affects this
|
---|
775 | * dataset.
|
---|
776 | *
|
---|
777 | * @param att the attribute
|
---|
778 | * @param name the new name
|
---|
779 | */
|
---|
780 | public final void renameAttribute(Attribute att, String name) {
|
---|
781 |
|
---|
782 | renameAttribute(att.index(), name);
|
---|
783 | }
|
---|
784 |
|
---|
785 | /**
|
---|
786 | * Renames the value of a nominal (or string) attribute value. This
|
---|
787 | * change only affects this dataset.
|
---|
788 | *
|
---|
789 | * @param att the attribute's index
|
---|
790 | * @param val the value's index
|
---|
791 | * @param name the new name
|
---|
792 | */
|
---|
793 | public final void renameAttributeValue(int att, int val, String name) {
|
---|
794 |
|
---|
795 | Attribute newAtt = (Attribute)attribute(att).copy();
|
---|
796 | FastVector newVec = new FastVector(numAttributes());
|
---|
797 |
|
---|
798 | newAtt.setValue(val, name);
|
---|
799 | for (int i = 0; i < numAttributes(); i++) {
|
---|
800 | if (i == att) {
|
---|
801 | newVec.addElement(newAtt);
|
---|
802 | } else {
|
---|
803 | newVec.addElement(attribute(i));
|
---|
804 | }
|
---|
805 | }
|
---|
806 | m_Attributes = newVec;
|
---|
807 | }
|
---|
808 |
|
---|
809 | /**
|
---|
810 | * Renames the value of a nominal (or string) attribute value. This
|
---|
811 | * change only affects this dataset.
|
---|
812 | *
|
---|
813 | * @param att the attribute
|
---|
814 | * @param val the value
|
---|
815 | * @param name the new name
|
---|
816 | */
|
---|
817 | public final void renameAttributeValue(Attribute att, String val,
|
---|
818 | String name) {
|
---|
819 |
|
---|
820 | renameAttributeValue(att.index(), att.indexOfValue(val), name);
|
---|
821 | }
|
---|
822 |
|
---|
823 | /**
|
---|
824 | * Creates a new dataset of the same size using random sampling
|
---|
825 | * with replacement.
|
---|
826 | *
|
---|
827 | * @param random a random number generator
|
---|
828 | * @return the new dataset
|
---|
829 | */
|
---|
830 | public final Instances resample(Random random) {
|
---|
831 |
|
---|
832 | Instances newData = new Instances(this, numInstances());
|
---|
833 | while (newData.numInstances() < numInstances()) {
|
---|
834 | int i = (int) (random.nextDouble() * (double) numInstances());
|
---|
835 | newData.add(instance(i));
|
---|
836 | }
|
---|
837 | return newData;
|
---|
838 | }
|
---|
839 |
|
---|
840 | /**
|
---|
841 | * Creates a new dataset of the same size using random sampling
|
---|
842 | * with replacement according to the current instance weights. The
|
---|
843 | * weights of the instances in the new dataset are set to one.
|
---|
844 | *
|
---|
845 | * @param random a random number generator
|
---|
846 | * @return the new dataset
|
---|
847 | */
|
---|
848 | public final Instances resampleWithWeights(Random random) {
|
---|
849 |
|
---|
850 | double [] weights = new double[numInstances()];
|
---|
851 | boolean foundOne = false;
|
---|
852 | for (int i = 0; i < weights.length; i++) {
|
---|
853 | weights[i] = instance(i).weight();
|
---|
854 | if (!Utils.eq(weights[i], weights[0])) {
|
---|
855 | foundOne = true;
|
---|
856 | }
|
---|
857 | }
|
---|
858 | if (foundOne) {
|
---|
859 | return resampleWithWeights(random, weights);
|
---|
860 | } else {
|
---|
861 | return new Instances(this);
|
---|
862 | }
|
---|
863 | }
|
---|
864 |
|
---|
865 |
|
---|
866 | /**
|
---|
867 | * Creates a new dataset of the same size using random sampling
|
---|
868 | * with replacement according to the given weight vector. The
|
---|
869 | * weights of the instances in the new dataset are set to one.
|
---|
870 | * The length of the weight vector has to be the same as the
|
---|
871 | * number of instances in the dataset, and all weights have to
|
---|
872 | * be positive.
|
---|
873 | *
|
---|
874 | * @param random a random number generator
|
---|
875 | * @param weights the weight vector
|
---|
876 | * @return the new dataset
|
---|
877 | * @exception IllegalArgumentException if the weights array is of the wrong
|
---|
878 | * length or contains negative weights.
|
---|
879 | */
|
---|
880 | public final Instances resampleWithWeights(Random random,
|
---|
881 | double[] weights) {
|
---|
882 |
|
---|
883 | if (weights.length != numInstances()) {
|
---|
884 | throw new IllegalArgumentException("weights.length != numInstances.");
|
---|
885 | }
|
---|
886 | Instances newData = new Instances(this, numInstances());
|
---|
887 | double[] probabilities = new double[numInstances()];
|
---|
888 | double sumProbs = 0, sumOfWeights = Utils.sum(weights);
|
---|
889 | for (int i = 0; i < numInstances(); i++) {
|
---|
890 | sumProbs += random.nextDouble();
|
---|
891 | probabilities[i] = sumProbs;
|
---|
892 | }
|
---|
893 | Utils.normalize(probabilities, sumProbs / sumOfWeights);
|
---|
894 |
|
---|
895 | // Make sure that rounding errors don't mess things up
|
---|
896 | probabilities[numInstances() - 1] = sumOfWeights;
|
---|
897 | int k = 0; int l = 0;
|
---|
898 | sumProbs = 0;
|
---|
899 | while ((k < numInstances() && (l < numInstances()))) {
|
---|
900 | if (weights[l] < 0) {
|
---|
901 | throw new IllegalArgumentException("Weights have to be positive.");
|
---|
902 | }
|
---|
903 | sumProbs += weights[l];
|
---|
904 | while ((k < numInstances()) &&
|
---|
905 | (probabilities[k] <= sumProbs)) {
|
---|
906 | newData.add(instance(l));
|
---|
907 | newData.instance(k).setWeight(1);
|
---|
908 | k++;
|
---|
909 | }
|
---|
910 | l++;
|
---|
911 | }
|
---|
912 | return newData;
|
---|
913 | }
|
---|
914 |
|
---|
915 | /**
|
---|
916 | * Sets the class attribute.
|
---|
917 | *
|
---|
918 | * @param att attribute to be the class
|
---|
919 | */
|
---|
920 | public final void setClass(Attribute att) {
|
---|
921 |
|
---|
922 | m_ClassIndex = att.index();
|
---|
923 | }
|
---|
924 |
|
---|
925 | /**
|
---|
926 | * Sets the class index of the set.
|
---|
927 | * If the class index is negative there is assumed to be no class.
|
---|
928 | * (ie. it is undefined)
|
---|
929 | *
|
---|
930 | * @param classIndex the new class index
|
---|
931 | * @exception IllegalArgumentException if the class index is too big or < 0
|
---|
932 | */
|
---|
933 | public final void setClassIndex(int classIndex) {
|
---|
934 |
|
---|
935 | if (classIndex >= numAttributes()) {
|
---|
936 | throw new IllegalArgumentException("Invalid class index: " + classIndex);
|
---|
937 | }
|
---|
938 | m_ClassIndex = classIndex;
|
---|
939 | }
|
---|
940 |
|
---|
941 | /**
|
---|
942 | * Sets the relation's name.
|
---|
943 | *
|
---|
944 | * @param newName the new relation name.
|
---|
945 | */
|
---|
946 | public final void setRelationName(String newName) {
|
---|
947 |
|
---|
948 | m_RelationName = newName;
|
---|
949 | }
|
---|
950 |
|
---|
951 | /**
|
---|
952 | * Sorts the instances based on an attribute. For numeric attributes,
|
---|
953 | * instances are sorted in ascending order. For nominal attributes,
|
---|
954 | * instances are sorted based on the attribute label ordering
|
---|
955 | * specified in the header. Instances with missing values for the
|
---|
956 | * attribute are placed at the end of the dataset.
|
---|
957 | *
|
---|
958 | * @param attIndex the attribute's index
|
---|
959 | */
|
---|
960 | public final void sort(int attIndex) {
|
---|
961 |
|
---|
962 | int i,j;
|
---|
963 |
|
---|
964 | // move all instances with missing values to end
|
---|
965 | j = numInstances() - 1;
|
---|
966 | i = 0;
|
---|
967 | while (i <= j) {
|
---|
968 | if (instance(j).isMissing(attIndex)) {
|
---|
969 | j--;
|
---|
970 | } else {
|
---|
971 | if (instance(i).isMissing(attIndex)) {
|
---|
972 | swap(i,j);
|
---|
973 | j--;
|
---|
974 | }
|
---|
975 | i++;
|
---|
976 | }
|
---|
977 | }
|
---|
978 | quickSort(attIndex, 0, j);
|
---|
979 | }
|
---|
980 |
|
---|
981 | /**
|
---|
982 | * Sorts the instances based on an attribute. For numeric attributes,
|
---|
983 | * instances are sorted into ascending order. For nominal attributes,
|
---|
984 | * instances are sorted based on the attribute label ordering
|
---|
985 | * specified in the header. Instances with missing values for the
|
---|
986 | * attribute are placed at the end of the dataset.
|
---|
987 | *
|
---|
988 | * @param att the attribute
|
---|
989 | */
|
---|
990 | public final void sort(Attribute att) {
|
---|
991 |
|
---|
992 | sort(att.index());
|
---|
993 | }
|
---|
994 |
|
---|
995 | /**
|
---|
996 | * Stratifies a set of instances according to its class values
|
---|
997 | * if the class attribute is nominal (so that afterwards a
|
---|
998 | * stratified cross-validation can be performed).
|
---|
999 | *
|
---|
1000 | * @param numFolds the number of folds in the cross-validation
|
---|
1001 | * @exception UnassignedClassException if the class is not set
|
---|
1002 | */
|
---|
1003 | public final void stratify(int numFolds) {
|
---|
1004 |
|
---|
1005 | if (numFolds <= 0) {
|
---|
1006 | throw new IllegalArgumentException("Number of folds must be greater than 1");
|
---|
1007 | }
|
---|
1008 | if (m_ClassIndex < 0) {
|
---|
1009 | throw new UnassignedClassException("Class index is negative (not set)!");
|
---|
1010 | }
|
---|
1011 | if (classAttribute().isNominal()) {
|
---|
1012 |
|
---|
1013 | // sort by class
|
---|
1014 | int index = 1;
|
---|
1015 | while (index < numInstances()) {
|
---|
1016 | Instance instance1 = instance(index - 1);
|
---|
1017 | for (int j = index; j < numInstances(); j++) {
|
---|
1018 | Instance instance2 = instance(j);
|
---|
1019 | if ((instance1.classValue() == instance2.classValue()) ||
|
---|
1020 | (instance1.classIsMissing() &&
|
---|
1021 | instance2.classIsMissing())) {
|
---|
1022 | swap(index,j);
|
---|
1023 | index++;
|
---|
1024 | }
|
---|
1025 | }
|
---|
1026 | index++;
|
---|
1027 | }
|
---|
1028 | stratStep(numFolds);
|
---|
1029 | }
|
---|
1030 | }
|
---|
1031 |
|
---|
1032 | /**
|
---|
1033 | * Computes the sum of all the instances' weights.
|
---|
1034 | *
|
---|
1035 | * @return the sum of all the instances' weights as a double
|
---|
1036 | */
|
---|
1037 | public final double sumOfWeights() {
|
---|
1038 |
|
---|
1039 | double sum = 0;
|
---|
1040 |
|
---|
1041 | for (int i = 0; i < numInstances(); i++) {
|
---|
1042 | sum += instance(i).weight();
|
---|
1043 | }
|
---|
1044 | return sum;
|
---|
1045 | }
|
---|
1046 |
|
---|
1047 | /**
|
---|
1048 | * Creates the test set for one fold of a cross-validation on
|
---|
1049 | * the dataset.
|
---|
1050 | *
|
---|
1051 | * @param numFolds the number of folds in the cross-validation. Must
|
---|
1052 | * be greater than 1.
|
---|
1053 | * @param numFold 0 for the first fold, 1 for the second, ...
|
---|
1054 | * @return the test set as a set of weighted instances
|
---|
1055 | * @exception IllegalArgumentException if the number of folds is less than 2
|
---|
1056 | * or greater than the number of instances.
|
---|
1057 | */
|
---|
1058 | public Instances testCV(int numFolds, int numFold) {
|
---|
1059 |
|
---|
1060 | int numInstForFold, first, offset;
|
---|
1061 | Instances test;
|
---|
1062 |
|
---|
1063 | if (numFolds < 2) {
|
---|
1064 | throw new IllegalArgumentException("Number of folds must be at least 2!");
|
---|
1065 | }
|
---|
1066 | if (numFolds > numInstances()) {
|
---|
1067 | throw new IllegalArgumentException("Can't have more folds than instances!");
|
---|
1068 | }
|
---|
1069 | numInstForFold = numInstances() / numFolds;
|
---|
1070 | if (numFold < numInstances() % numFolds){
|
---|
1071 | numInstForFold++;
|
---|
1072 | offset = numFold;
|
---|
1073 | }else
|
---|
1074 | offset = numInstances() % numFolds;
|
---|
1075 | test = new Instances(this, numInstForFold);
|
---|
1076 | first = numFold * (numInstances() / numFolds) + offset;
|
---|
1077 | copyInstances(first, test, numInstForFold);
|
---|
1078 | return test;
|
---|
1079 | }
|
---|
1080 |
|
---|
1081 | /**
|
---|
1082 | * Returns the dataset as a string in ARFF format. Strings
|
---|
1083 | * are quoted if they contain whitespace characters, or if they
|
---|
1084 | * are a question mark.
|
---|
1085 | *
|
---|
1086 | * @return the dataset in ARFF format as a string
|
---|
1087 | */
|
---|
1088 | public final String toString() {
|
---|
1089 |
|
---|
1090 | StringBuffer text = new StringBuffer();
|
---|
1091 |
|
---|
1092 | text.append("@relation " + Utils.quote(m_RelationName) + "\n\n");
|
---|
1093 | for (int i = 0; i < numAttributes(); i++) {
|
---|
1094 | text.append(attribute(i) + "\n");
|
---|
1095 | }
|
---|
1096 | text.append("\n@data\n");
|
---|
1097 | for (int i = 0; i < numInstances(); i++) {
|
---|
1098 | text.append(instance(i));
|
---|
1099 | if (i < numInstances() - 1) {
|
---|
1100 | text.append('\n');
|
---|
1101 | }
|
---|
1102 | }
|
---|
1103 | return text.toString();
|
---|
1104 | }
|
---|
1105 |
|
---|
1106 | /**
|
---|
1107 | * Creates the training set for one fold of a cross-validation
|
---|
1108 | * on the dataset.
|
---|
1109 | *
|
---|
1110 | * @param numFolds the number of folds in the cross-validation. Must
|
---|
1111 | * be greater than 1.
|
---|
1112 | * @param numFold 0 for the first fold, 1 for the second, ...
|
---|
1113 | * @return the training set as a set of weighted
|
---|
1114 | * instances
|
---|
1115 | * @exception IllegalArgumentException if the number of folds is less than 2
|
---|
1116 | * or greater than the number of instances.
|
---|
1117 | */
|
---|
1118 | public Instances trainCV(int numFolds, int numFold) {
|
---|
1119 |
|
---|
1120 | int numInstForFold, first, offset;
|
---|
1121 | Instances train;
|
---|
1122 |
|
---|
1123 | if (numFolds < 2) {
|
---|
1124 | throw new IllegalArgumentException("Number of folds must be at least 2!");
|
---|
1125 | }
|
---|
1126 | if (numFolds > numInstances()) {
|
---|
1127 | throw new IllegalArgumentException("Can't have more folds than instances!");
|
---|
1128 | }
|
---|
1129 | numInstForFold = numInstances() / numFolds;
|
---|
1130 | if (numFold < numInstances() % numFolds) {
|
---|
1131 | numInstForFold++;
|
---|
1132 | offset = numFold;
|
---|
1133 | }else
|
---|
1134 | offset = numInstances() % numFolds;
|
---|
1135 | train = new Instances(this, numInstances() - numInstForFold);
|
---|
1136 | first = numFold * (numInstances() / numFolds) + offset;
|
---|
1137 | copyInstances(0, train, first);
|
---|
1138 | copyInstances(first + numInstForFold, train,
|
---|
1139 | numInstances() - first - numInstForFold);
|
---|
1140 |
|
---|
1141 | return train;
|
---|
1142 | }
|
---|
1143 |
|
---|
1144 | /**
|
---|
1145 | * Computes the variance for a numeric attribute.
|
---|
1146 | *
|
---|
1147 | * @param attIndex the numeric attribute
|
---|
1148 | * @return the variance if the attribute is numeric
|
---|
1149 | * @exception IllegalArgumentException if the attribute is not numeric
|
---|
1150 | */
|
---|
1151 | public final double variance(int attIndex) {
|
---|
1152 |
|
---|
1153 | double sum = 0, sumSquared = 0, sumOfWeights = 0;
|
---|
1154 |
|
---|
1155 | if (!attribute(attIndex).isNumeric()) {
|
---|
1156 | throw new IllegalArgumentException("Can't compute variance because attribute is " +
|
---|
1157 | "not numeric!");
|
---|
1158 | }
|
---|
1159 | for (int i = 0; i < numInstances(); i++) {
|
---|
1160 | if (!instance(i).isMissing(attIndex)) {
|
---|
1161 | sum += instance(i).weight() *
|
---|
1162 | instance(i).value(attIndex);
|
---|
1163 | sumSquared += instance(i).weight() *
|
---|
1164 | instance(i).value(attIndex) *
|
---|
1165 | instance(i).value(attIndex);
|
---|
1166 | sumOfWeights += instance(i).weight();
|
---|
1167 | }
|
---|
1168 | }
|
---|
1169 | if (Utils.smOrEq(sumOfWeights, 1)) {
|
---|
1170 | return 0;
|
---|
1171 | }
|
---|
1172 | return (sumSquared - (sum * sum / sumOfWeights)) /
|
---|
1173 | (sumOfWeights - 1);
|
---|
1174 | }
|
---|
1175 |
|
---|
1176 | /**
|
---|
1177 | * Computes the variance for a numeric attribute.
|
---|
1178 | *
|
---|
1179 | * @param att the numeric attribute
|
---|
1180 | * @return the variance if the attribute is numeric
|
---|
1181 | * @exception IllegalArgumentException if the attribute is not numeric
|
---|
1182 | */
|
---|
1183 | public final double variance(Attribute att) {
|
---|
1184 |
|
---|
1185 | return variance(att.index());
|
---|
1186 | }
|
---|
1187 |
|
---|
1188 | /**
|
---|
1189 | * Calculates summary statistics on the values that appear in this
|
---|
1190 | * set of instances for a specified attribute.
|
---|
1191 | *
|
---|
1192 | * @param index the index of the attribute to summarize.
|
---|
1193 | * @return an AttributeStats object with it's fields calculated.
|
---|
1194 | */
|
---|
1195 | public AttributeStats attributeStats(int index) {
|
---|
1196 |
|
---|
1197 | AttributeStats result = new AttributeStats();
|
---|
1198 | if (attribute(index).isNominal()) {
|
---|
1199 | result.nominalCounts = new int [attribute(index).numValues()];
|
---|
1200 | }
|
---|
1201 | if (attribute(index).isNumeric()) {
|
---|
1202 | result.numericStats = new weka.experiment.Stats();
|
---|
1203 | }
|
---|
1204 | result.totalCount = numInstances();
|
---|
1205 |
|
---|
1206 | double [] attVals = attributeToDoubleArray(index);
|
---|
1207 | int [] sorted = Utils.sort(attVals);
|
---|
1208 | int currentCount = 0;
|
---|
1209 | double prev = Instance.missingValue();
|
---|
1210 | for (int j = 0; j < numInstances(); j++) {
|
---|
1211 | Instance current = instance(sorted[j]);
|
---|
1212 | if (current.isMissing(index)) {
|
---|
1213 | result.missingCount = numInstances() - j;
|
---|
1214 | break;
|
---|
1215 | }
|
---|
1216 | if (Utils.eq(current.value(index), prev)) {
|
---|
1217 | currentCount++;
|
---|
1218 | } else {
|
---|
1219 | result.addDistinct(prev, currentCount);
|
---|
1220 | currentCount = 1;
|
---|
1221 | prev = current.value(index);
|
---|
1222 | }
|
---|
1223 | }
|
---|
1224 | result.addDistinct(prev, currentCount);
|
---|
1225 | result.distinctCount--; // So we don't count "missing" as a value
|
---|
1226 | return result;
|
---|
1227 | }
|
---|
1228 |
|
---|
1229 | /**
|
---|
1230 | * Gets the value of all instances in this dataset for a particular
|
---|
1231 | * attribute. Useful in conjunction with Utils.sort to allow iterating
|
---|
1232 | * through the dataset in sorted order for some attribute.
|
---|
1233 | *
|
---|
1234 | * @param index the index of the attribute.
|
---|
1235 | * @return an array containing the value of the desired attribute for
|
---|
1236 | * each instance in the dataset.
|
---|
1237 | */
|
---|
1238 | public double [] attributeToDoubleArray(int index) {
|
---|
1239 |
|
---|
1240 | double [] result = new double[numInstances()];
|
---|
1241 | for (int i = 0; i < result.length; i++) {
|
---|
1242 | result[i] = instance(i).value(index);
|
---|
1243 | }
|
---|
1244 | return result;
|
---|
1245 | }
|
---|
1246 |
|
---|
1247 | /**
|
---|
1248 | * Generates a string summarizing the set of instances. Gives a breakdown
|
---|
1249 | * for each attribute indicating the number of missing/discrete/unique
|
---|
1250 | * values and other information.
|
---|
1251 | *
|
---|
1252 | * @return a string summarizing the dataset
|
---|
1253 | */
|
---|
1254 | public String toSummaryString() {
|
---|
1255 |
|
---|
1256 | StringBuffer result = new StringBuffer();
|
---|
1257 | result.append("Relation Name: ").append(relationName()).append('\n');
|
---|
1258 | result.append("Num Instances: ").append(numInstances()).append('\n');
|
---|
1259 | result.append("Num Attributes: ").append(numAttributes()).append('\n');
|
---|
1260 | result.append('\n');
|
---|
1261 |
|
---|
1262 | result.append(Utils.padLeft("", 5)).append(Utils.padRight("Name", 25));
|
---|
1263 | result.append(Utils.padLeft("Type", 5)).append(Utils.padLeft("Nom", 5));
|
---|
1264 | result.append(Utils.padLeft("Int", 5)).append(Utils.padLeft("Real", 5));
|
---|
1265 | result.append(Utils.padLeft("Missing", 12));
|
---|
1266 | result.append(Utils.padLeft("Unique", 12));
|
---|
1267 | result.append(Utils.padLeft("Dist", 6)).append('\n');
|
---|
1268 | for (int i = 0; i < numAttributes(); i++) {
|
---|
1269 | Attribute a = attribute(i);
|
---|
1270 | AttributeStats as = attributeStats(i);
|
---|
1271 | result.append(Utils.padLeft("" + (i + 1), 4)).append(' ');
|
---|
1272 | result.append(Utils.padRight(a.name(), 25)).append(' ');
|
---|
1273 | long percent;
|
---|
1274 | switch (a.type()) {
|
---|
1275 | case Attribute.NOMINAL:
|
---|
1276 | result.append(Utils.padLeft("Nom", 4)).append(' ');
|
---|
1277 | percent = Math.round(100.0 * as.intCount / as.totalCount);
|
---|
1278 | result.append(Utils.padLeft("" + percent, 3)).append("% ");
|
---|
1279 | result.append(Utils.padLeft("" + 0, 3)).append("% ");
|
---|
1280 | percent = Math.round(100.0 * as.realCount / as.totalCount);
|
---|
1281 | result.append(Utils.padLeft("" + percent, 3)).append("% ");
|
---|
1282 | break;
|
---|
1283 | case Attribute.NUMERIC:
|
---|
1284 | result.append(Utils.padLeft("Num", 4)).append(' ');
|
---|
1285 | result.append(Utils.padLeft("" + 0, 3)).append("% ");
|
---|
1286 | percent = Math.round(100.0 * as.intCount / as.totalCount);
|
---|
1287 | result.append(Utils.padLeft("" + percent, 3)).append("% ");
|
---|
1288 | percent = Math.round(100.0 * as.realCount / as.totalCount);
|
---|
1289 | result.append(Utils.padLeft("" + percent, 3)).append("% ");
|
---|
1290 | break;
|
---|
1291 | case Attribute.STRING:
|
---|
1292 | result.append(Utils.padLeft("Str", 4)).append(' ');
|
---|
1293 | percent = Math.round(100.0 * as.intCount / as.totalCount);
|
---|
1294 | result.append(Utils.padLeft("" + percent, 3)).append("% ");
|
---|
1295 | result.append(Utils.padLeft("" + 0, 3)).append("% ");
|
---|
1296 | percent = Math.round(100.0 * as.realCount / as.totalCount);
|
---|
1297 | result.append(Utils.padLeft("" + percent, 3)).append("% ");
|
---|
1298 | break;
|
---|
1299 | default:
|
---|
1300 | result.append(Utils.padLeft("???", 4)).append(' ');
|
---|
1301 | result.append(Utils.padLeft("" + 0, 3)).append("% ");
|
---|
1302 | percent = Math.round(100.0 * as.intCount / as.totalCount);
|
---|
1303 | result.append(Utils.padLeft("" + percent, 3)).append("% ");
|
---|
1304 | percent = Math.round(100.0 * as.realCount / as.totalCount);
|
---|
1305 | result.append(Utils.padLeft("" + percent, 3)).append("% ");
|
---|
1306 | break;
|
---|
1307 | }
|
---|
1308 | result.append(Utils.padLeft("" + as.missingCount, 5)).append(" /");
|
---|
1309 | percent = Math.round(100.0 * as.missingCount / as.totalCount);
|
---|
1310 | result.append(Utils.padLeft("" + percent, 3)).append("% ");
|
---|
1311 | result.append(Utils.padLeft("" + as.uniqueCount, 5)).append(" /");
|
---|
1312 | percent = Math.round(100.0 * as.uniqueCount / as.totalCount);
|
---|
1313 | result.append(Utils.padLeft("" + percent, 3)).append("% ");
|
---|
1314 | result.append(Utils.padLeft("" + as.distinctCount, 5)).append(' ');
|
---|
1315 | result.append('\n');
|
---|
1316 | }
|
---|
1317 | return result.toString();
|
---|
1318 | }
|
---|
1319 |
|
---|
1320 | /**
|
---|
1321 | * Reads a single instance using the tokenizer and appends it
|
---|
1322 | * to the dataset. Automatically expands the dataset if it
|
---|
1323 | * is not large enough to hold the instance.
|
---|
1324 | *
|
---|
1325 | * @param tokenizer the tokenizer to be used
|
---|
1326 | * @param flag if method should test for carriage return after
|
---|
1327 | * each instance
|
---|
1328 | * @return false if end of file has been reached
|
---|
1329 | * @exception IOException if the information is not read
|
---|
1330 | * successfully
|
---|
1331 | */
|
---|
1332 | protected boolean getInstance(StreamTokenizer tokenizer,
|
---|
1333 | boolean flag)
|
---|
1334 | throws IOException {
|
---|
1335 |
|
---|
1336 | // Check if any attributes have been declared.
|
---|
1337 | if (m_Attributes.size() == 0) {
|
---|
1338 | errms(tokenizer,"no header information available");
|
---|
1339 | }
|
---|
1340 |
|
---|
1341 | // Check if end of file reached.
|
---|
1342 | getFirstToken(tokenizer);
|
---|
1343 | if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
|
---|
1344 | return false;
|
---|
1345 | }
|
---|
1346 |
|
---|
1347 | // Parse instance
|
---|
1348 | if (tokenizer.ttype == '{') {
|
---|
1349 | return getInstanceSparse(tokenizer, flag);
|
---|
1350 | } else {
|
---|
1351 | return getInstanceFull(tokenizer, flag);
|
---|
1352 | }
|
---|
1353 | }
|
---|
1354 |
|
---|
1355 | /**
|
---|
1356 | * Reads a single instance using the tokenizer and appends it
|
---|
1357 | * to the dataset. Automatically expands the dataset if it
|
---|
1358 | * is not large enough to hold the instance.
|
---|
1359 | *
|
---|
1360 | * @param tokenizer the tokenizer to be used
|
---|
1361 | * @param flag if method should test for carriage return after
|
---|
1362 | * each instance
|
---|
1363 | * @return false if end of file has been reached
|
---|
1364 | * @exception IOException if the information is not read
|
---|
1365 | * successfully
|
---|
1366 | */
|
---|
1367 | protected boolean getInstanceSparse(StreamTokenizer tokenizer,
|
---|
1368 | boolean flag)
|
---|
1369 | throws IOException {
|
---|
1370 |
|
---|
1371 | int valIndex, numValues = 0, maxIndex = -1;
|
---|
1372 |
|
---|
1373 | // Get values
|
---|
1374 | do {
|
---|
1375 |
|
---|
1376 | // Get index
|
---|
1377 | getIndex(tokenizer);
|
---|
1378 | if (tokenizer.ttype == '}') {
|
---|
1379 | break;
|
---|
1380 | }
|
---|
1381 |
|
---|
1382 | // Is index valid?
|
---|
1383 | try{
|
---|
1384 | m_IndicesBuffer[numValues] = Integer.valueOf(tokenizer.sval).intValue();
|
---|
1385 | } catch (NumberFormatException e) {
|
---|
1386 | errms(tokenizer,"index number expected");
|
---|
1387 | }
|
---|
1388 | if (m_IndicesBuffer[numValues] <= maxIndex) {
|
---|
1389 | errms(tokenizer,"indices have to be ordered");
|
---|
1390 | }
|
---|
1391 | if ((m_IndicesBuffer[numValues] < 0) ||
|
---|
1392 | (m_IndicesBuffer[numValues] >= numAttributes())) {
|
---|
1393 | errms(tokenizer,"index out of bounds");
|
---|
1394 | }
|
---|
1395 | maxIndex = m_IndicesBuffer[numValues];
|
---|
1396 |
|
---|
1397 | // Get value;
|
---|
1398 | getNextToken(tokenizer);
|
---|
1399 |
|
---|
1400 | // Check if value is missing.
|
---|
1401 | if (tokenizer.ttype == '?') {
|
---|
1402 | m_ValueBuffer[numValues] = Instance.missingValue();
|
---|
1403 | } else {
|
---|
1404 |
|
---|
1405 | // Check if token is valid.
|
---|
1406 | if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
|
---|
1407 | errms(tokenizer,"not a valid value");
|
---|
1408 | }
|
---|
1409 | if (attribute(m_IndicesBuffer[numValues]).isNominal()) {
|
---|
1410 |
|
---|
1411 | // Check if value appears in header.
|
---|
1412 | valIndex =
|
---|
1413 | attribute(m_IndicesBuffer[numValues]).indexOfValue(tokenizer.sval);
|
---|
1414 | if (valIndex == -1) {
|
---|
1415 | errms(tokenizer,"nominal value not declared in header");
|
---|
1416 | }
|
---|
1417 | m_ValueBuffer[numValues] = (double)valIndex;
|
---|
1418 | } else if (attribute(m_IndicesBuffer[numValues]).isNumeric()) {
|
---|
1419 |
|
---|
1420 | // Check if value is really a number.
|
---|
1421 | try{
|
---|
1422 | m_ValueBuffer[numValues] = Double.valueOf(tokenizer.sval).
|
---|
1423 | doubleValue();
|
---|
1424 | } catch (NumberFormatException e) {
|
---|
1425 | errms(tokenizer,"number expected");
|
---|
1426 | }
|
---|
1427 | } else {
|
---|
1428 | m_ValueBuffer[numValues] =
|
---|
1429 | attribute(m_IndicesBuffer[numValues]).addStringValue(tokenizer.sval);
|
---|
1430 | }
|
---|
1431 | }
|
---|
1432 | numValues++;
|
---|
1433 | } while (true);
|
---|
1434 | if (flag) {
|
---|
1435 | getLastToken(tokenizer,true);
|
---|
1436 | }
|
---|
1437 |
|
---|
1438 | // Add instance to dataset
|
---|
1439 | double[] tempValues = new double[numValues];
|
---|
1440 | int[] tempIndices = new int[numValues];
|
---|
1441 | System.arraycopy(m_ValueBuffer, 0, tempValues, 0, numValues);
|
---|
1442 | System.arraycopy(m_IndicesBuffer, 0, tempIndices, 0, numValues);
|
---|
1443 | add(new SparseInstance(1, tempValues, tempIndices, numAttributes()));
|
---|
1444 | return true;
|
---|
1445 | }
|
---|
1446 |
|
---|
1447 | /**
|
---|
1448 | * Reads a single instance using the tokenizer and appends it
|
---|
1449 | * to the dataset. Automatically expands the dataset if it
|
---|
1450 | * is not large enough to hold the instance.
|
---|
1451 | *
|
---|
1452 | * @param tokenizer the tokenizer to be used
|
---|
1453 | * @param flag if method should test for carriage return after
|
---|
1454 | * each instance
|
---|
1455 | * @return false if end of file has been reached
|
---|
1456 | * @exception IOException if the information is not read
|
---|
1457 | * successfully
|
---|
1458 | */
|
---|
1459 | protected boolean getInstanceFull(StreamTokenizer tokenizer,
|
---|
1460 | boolean flag)
|
---|
1461 | throws IOException {
|
---|
1462 |
|
---|
1463 | double[] instance = new double[numAttributes()];
|
---|
1464 | int index;
|
---|
1465 |
|
---|
1466 | // Get values for all attributes.
|
---|
1467 | for (int i = 0; i < numAttributes(); i++){
|
---|
1468 |
|
---|
1469 | // Get next token
|
---|
1470 | if (i > 0) {
|
---|
1471 | getNextToken(tokenizer);
|
---|
1472 | }
|
---|
1473 |
|
---|
1474 | // Check if value is missing.
|
---|
1475 | if (tokenizer.ttype == '?') {
|
---|
1476 | instance[i] = Instance.missingValue();
|
---|
1477 | } else {
|
---|
1478 |
|
---|
1479 | // Check if token is valid.
|
---|
1480 | if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
|
---|
1481 | errms(tokenizer,"not a valid value");
|
---|
1482 | }
|
---|
1483 | if (attribute(i).isNominal()) {
|
---|
1484 |
|
---|
1485 | // Check if value appears in header.
|
---|
1486 | index = attribute(i).indexOfValue(tokenizer.sval);
|
---|
1487 | if (index == -1) {
|
---|
1488 | errms(tokenizer,"nominal value not declared in header");
|
---|
1489 | }
|
---|
1490 | instance[i] = (double)index;
|
---|
1491 | } else if (attribute(i).isNumeric()) {
|
---|
1492 |
|
---|
1493 | // Check if value is really a number.
|
---|
1494 | try{
|
---|
1495 | instance[i] = Double.valueOf(tokenizer.sval).
|
---|
1496 | doubleValue();
|
---|
1497 | } catch (NumberFormatException e) {
|
---|
1498 | errms(tokenizer,"number expected");
|
---|
1499 | }
|
---|
1500 | } else {
|
---|
1501 | instance[i] = attribute(i).addStringValue(tokenizer.sval);
|
---|
1502 | }
|
---|
1503 | }
|
---|
1504 | }
|
---|
1505 | if (flag) {
|
---|
1506 | getLastToken(tokenizer,true);
|
---|
1507 | }
|
---|
1508 |
|
---|
1509 | // Add instance to dataset
|
---|
1510 | add(new Instance(1, instance));
|
---|
1511 | return true;
|
---|
1512 | }
|
---|
1513 |
|
---|
1514 | /**
|
---|
1515 | * Reads and stores header of an ARFF file.
|
---|
1516 | *
|
---|
1517 | * @param tokenizer the stream tokenizer
|
---|
1518 | * @exception IOException if the information is not read
|
---|
1519 | * successfully
|
---|
1520 | */
|
---|
1521 | protected void readHeader(StreamTokenizer tokenizer)
|
---|
1522 | throws IOException{
|
---|
1523 |
|
---|
1524 | String attributeName;
|
---|
1525 | FastVector attributeValues;
|
---|
1526 | int i;
|
---|
1527 |
|
---|
1528 | // Get name of relation.
|
---|
1529 | getFirstToken(tokenizer);
|
---|
1530 | if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
|
---|
1531 | errms(tokenizer,"premature end of file");
|
---|
1532 | }
|
---|
1533 | if (tokenizer.sval.equalsIgnoreCase("@relation")){
|
---|
1534 | getNextToken(tokenizer);
|
---|
1535 | m_RelationName = tokenizer.sval;
|
---|
1536 | getLastToken(tokenizer,false);
|
---|
1537 | } else {
|
---|
1538 | errms(tokenizer,"keyword @relation expected");
|
---|
1539 | }
|
---|
1540 |
|
---|
1541 | // Create vectors to hold information temporarily.
|
---|
1542 | m_Attributes = new FastVector();
|
---|
1543 |
|
---|
1544 | // Get attribute declarations.
|
---|
1545 | getFirstToken(tokenizer);
|
---|
1546 | if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
|
---|
1547 | errms(tokenizer,"premature end of file");
|
---|
1548 | }
|
---|
1549 | while (tokenizer.sval.equalsIgnoreCase("@attribute")) {
|
---|
1550 |
|
---|
1551 | // Get attribute name.
|
---|
1552 | getNextToken(tokenizer);
|
---|
1553 | attributeName = tokenizer.sval;
|
---|
1554 | getNextToken(tokenizer);
|
---|
1555 |
|
---|
1556 | // Check if attribute is nominal.
|
---|
1557 | if (tokenizer.ttype == StreamTokenizer.TT_WORD) {
|
---|
1558 |
|
---|
1559 | // Attribute is real, integer, or string.
|
---|
1560 | if (tokenizer.sval.equalsIgnoreCase("real") ||
|
---|
1561 | tokenizer.sval.equalsIgnoreCase("integer") ||
|
---|
1562 | tokenizer.sval.equalsIgnoreCase("numeric")) {
|
---|
1563 | m_Attributes.addElement(new Attribute(attributeName,
|
---|
1564 | numAttributes()));
|
---|
1565 | readTillEOL(tokenizer);
|
---|
1566 | } else if (tokenizer.sval.equalsIgnoreCase("string")) {
|
---|
1567 | m_Attributes.
|
---|
1568 | addElement(new Attribute(attributeName, null,
|
---|
1569 | numAttributes()));
|
---|
1570 | readTillEOL(tokenizer);
|
---|
1571 | } else {
|
---|
1572 | errms(tokenizer,"no valid attribute type or invalid "+
|
---|
1573 | "enumeration");
|
---|
1574 | }
|
---|
1575 | } else {
|
---|
1576 |
|
---|
1577 | // Attribute is nominal.
|
---|
1578 | attributeValues = new FastVector();
|
---|
1579 | tokenizer.pushBack();
|
---|
1580 |
|
---|
1581 | // Get values for nominal attribute.
|
---|
1582 | if (tokenizer.nextToken() != '{') {
|
---|
1583 | errms(tokenizer,"{ expected at beginning of enumeration");
|
---|
1584 | }
|
---|
1585 | while (tokenizer.nextToken() != '}') {
|
---|
1586 | if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
|
---|
1587 | errms(tokenizer,"} expected at end of enumeration");
|
---|
1588 | } else {
|
---|
1589 | attributeValues.addElement(tokenizer.sval);
|
---|
1590 | }
|
---|
1591 | }
|
---|
1592 | if (attributeValues.size() == 0) {
|
---|
1593 | errms(tokenizer,"no nominal values found");
|
---|
1594 | }
|
---|
1595 | m_Attributes.
|
---|
1596 | addElement(new Attribute(attributeName, attributeValues,
|
---|
1597 | numAttributes()));
|
---|
1598 | }
|
---|
1599 | getLastToken(tokenizer,false);
|
---|
1600 | getFirstToken(tokenizer);
|
---|
1601 | if (tokenizer.ttype == StreamTokenizer.TT_EOF)
|
---|
1602 | errms(tokenizer,"premature end of file");
|
---|
1603 | }
|
---|
1604 |
|
---|
1605 | // Check if data part follows. We can't easily check for EOL.
|
---|
1606 | if (!tokenizer.sval.equalsIgnoreCase("@data")) {
|
---|
1607 | errms(tokenizer,"keyword @data expected");
|
---|
1608 | }
|
---|
1609 |
|
---|
1610 | // Check if any attributes have been declared.
|
---|
1611 | if (m_Attributes.size() == 0) {
|
---|
1612 | errms(tokenizer,"no attributes declared");
|
---|
1613 | }
|
---|
1614 |
|
---|
1615 | // Allocate buffers in case sparse instances have to be read
|
---|
1616 | m_ValueBuffer = new double[numAttributes()];
|
---|
1617 | m_IndicesBuffer = new int[numAttributes()];
|
---|
1618 | }
|
---|
1619 |
|
---|
1620 | /**
|
---|
1621 | * Copies instances from one set to the end of another
|
---|
1622 | * one.
|
---|
1623 | *
|
---|
1624 | * @param source the source of the instances
|
---|
1625 | * @param from the position of the first instance to be copied
|
---|
1626 | * @param dest the destination for the instances
|
---|
1627 | * @param num the number of instances to be copied
|
---|
1628 | */
|
---|
1629 | private void copyInstances(int from, Instances dest, int num) {
|
---|
1630 |
|
---|
1631 | for (int i = 0; i < num; i++) {
|
---|
1632 | dest.add(instance(from + i));
|
---|
1633 | }
|
---|
1634 | }
|
---|
1635 |
|
---|
1636 | /**
|
---|
1637 | * Throws error message with line number and last token read.
|
---|
1638 | *
|
---|
1639 | * @param theMsg the error message to be thrown
|
---|
1640 | * @param tokenizer the stream tokenizer
|
---|
1641 | * @throws IOExcpetion containing the error message
|
---|
1642 | */
|
---|
1643 | private void errms(StreamTokenizer tokenizer, String theMsg)
|
---|
1644 | throws IOException {
|
---|
1645 |
|
---|
1646 | throw new IOException(theMsg + ", read " + tokenizer.toString());
|
---|
1647 | }
|
---|
1648 |
|
---|
1649 | /**
|
---|
1650 | * Replaces the attribute information by a clone of
|
---|
1651 | * itself.
|
---|
1652 | */
|
---|
1653 | private void freshAttributeInfo() {
|
---|
1654 |
|
---|
1655 | m_Attributes = (FastVector) m_Attributes.copyElements();
|
---|
1656 | }
|
---|
1657 |
|
---|
1658 | /**
|
---|
1659 | * Gets next token, skipping empty lines.
|
---|
1660 | *
|
---|
1661 | * @param tokenizer the stream tokenizer
|
---|
1662 | * @exception IOException if reading the next token fails
|
---|
1663 | */
|
---|
1664 | private void getFirstToken(StreamTokenizer tokenizer)
|
---|
1665 | throws IOException{
|
---|
1666 |
|
---|
1667 | while (tokenizer.nextToken() == StreamTokenizer.TT_EOL){};
|
---|
1668 | if ((tokenizer.ttype == '\'') ||
|
---|
1669 | (tokenizer.ttype == '"')) {
|
---|
1670 | tokenizer.ttype = StreamTokenizer.TT_WORD;
|
---|
1671 | } else if ((tokenizer.ttype == StreamTokenizer.TT_WORD) &&
|
---|
1672 | (tokenizer.sval.equals("?"))){
|
---|
1673 | tokenizer.ttype = '?';
|
---|
1674 | }
|
---|
1675 | }
|
---|
1676 |
|
---|
1677 | /**
|
---|
1678 | * Gets index, checking for a premature and of line.
|
---|
1679 | *
|
---|
1680 | * @param tokenizer the stream tokenizer
|
---|
1681 | * @exception IOException if it finds a premature end of line
|
---|
1682 | */
|
---|
1683 | private void getIndex(StreamTokenizer tokenizer) throws IOException{
|
---|
1684 |
|
---|
1685 | if (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
|
---|
1686 | errms(tokenizer,"premature end of line");
|
---|
1687 | }
|
---|
1688 | if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
|
---|
1689 | errms(tokenizer,"premature end of file");
|
---|
1690 | }
|
---|
1691 | }
|
---|
1692 |
|
---|
1693 | /**
|
---|
1694 | * Gets token and checks if its end of line.
|
---|
1695 | *
|
---|
1696 | * @param tokenizer the stream tokenizer
|
---|
1697 | * @exception IOException if it doesn't find an end of line
|
---|
1698 | */
|
---|
1699 | private void getLastToken(StreamTokenizer tokenizer, boolean endOfFileOk)
|
---|
1700 | throws IOException{
|
---|
1701 |
|
---|
1702 | if ((tokenizer.nextToken() != StreamTokenizer.TT_EOL) &&
|
---|
1703 | ((tokenizer.nextToken() != StreamTokenizer.TT_EOF) || !endOfFileOk)) {
|
---|
1704 | errms(tokenizer,"end of line expected");
|
---|
1705 | }
|
---|
1706 | }
|
---|
1707 |
|
---|
1708 | /**
|
---|
1709 | * Gets next token, checking for a premature and of line.
|
---|
1710 | *
|
---|
1711 | * @param tokenizer the stream tokenizer
|
---|
1712 | * @exception IOException if it finds a premature end of line
|
---|
1713 | */
|
---|
1714 | private void getNextToken(StreamTokenizer tokenizer)
|
---|
1715 | throws IOException{
|
---|
1716 |
|
---|
1717 | if (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
|
---|
1718 | errms(tokenizer,"premature end of line");
|
---|
1719 | }
|
---|
1720 | if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
|
---|
1721 | errms(tokenizer,"premature end of file");
|
---|
1722 | } else if ((tokenizer.ttype == '\'') ||
|
---|
1723 | (tokenizer.ttype == '"')) {
|
---|
1724 | tokenizer.ttype = StreamTokenizer.TT_WORD;
|
---|
1725 | } else if ((tokenizer.ttype == StreamTokenizer.TT_WORD) &&
|
---|
1726 | (tokenizer.sval.equals("?"))){
|
---|
1727 | tokenizer.ttype = '?';
|
---|
1728 | }
|
---|
1729 | }
|
---|
1730 |
|
---|
1731 | /**
|
---|
1732 | * Initializes the StreamTokenizer used for reading the ARFF file.
|
---|
1733 | *
|
---|
1734 | * @param tokenizer the stream tokenizer
|
---|
1735 | */
|
---|
1736 | private void initTokenizer(StreamTokenizer tokenizer){
|
---|
1737 |
|
---|
1738 | tokenizer.resetSyntax();
|
---|
1739 | tokenizer.whitespaceChars(0, ' ');
|
---|
1740 | tokenizer.wordChars(' '+1,'\u00FF');
|
---|
1741 | tokenizer.whitespaceChars(',',',');
|
---|
1742 | tokenizer.commentChar('%');
|
---|
1743 | tokenizer.quoteChar('"');
|
---|
1744 | tokenizer.quoteChar('\'');
|
---|
1745 | tokenizer.ordinaryChar('{');
|
---|
1746 | tokenizer.ordinaryChar('}');
|
---|
1747 | tokenizer.eolIsSignificant(true);
|
---|
1748 | }
|
---|
1749 |
|
---|
1750 | /**
|
---|
1751 | * Returns string including all instances, their weights and
|
---|
1752 | * their indices in the original dataset.
|
---|
1753 | *
|
---|
1754 | * @return description of instance and its weight as a string
|
---|
1755 | */
|
---|
1756 | private String instancesAndWeights(){
|
---|
1757 |
|
---|
1758 | StringBuffer text = new StringBuffer();
|
---|
1759 |
|
---|
1760 | for (int i = 0; i < numInstances(); i++) {
|
---|
1761 | text.append(instance(i) + " " + instance(i).weight());
|
---|
1762 | if (i < numInstances() - 1) {
|
---|
1763 | text.append("\n");
|
---|
1764 | }
|
---|
1765 | }
|
---|
1766 | return text.toString();
|
---|
1767 | }
|
---|
1768 |
|
---|
1769 | /**
|
---|
1770 | * Implements quicksort.
|
---|
1771 | *
|
---|
1772 | * @param attIndex the attribute's index
|
---|
1773 | * @param lo0 the first index of the subset to be sorted
|
---|
1774 | * @param hi0 the last index of the subset to be sorted
|
---|
1775 | */
|
---|
1776 | private void quickSort(int attIndex, int lo0, int hi0) {
|
---|
1777 |
|
---|
1778 | int lo = lo0, hi = hi0;
|
---|
1779 | double mid, midPlus, midMinus;
|
---|
1780 |
|
---|
1781 | if (hi0 > lo0) {
|
---|
1782 |
|
---|
1783 | // Arbitrarily establishing partition element as the
|
---|
1784 | // midpoint of the array.
|
---|
1785 | mid = instance((lo0 + hi0) / 2).value(attIndex);
|
---|
1786 | midPlus = mid + 1e-6;
|
---|
1787 | midMinus = mid - 1e-6;
|
---|
1788 |
|
---|
1789 | // loop through the array until indices cross
|
---|
1790 | while(lo <= hi) {
|
---|
1791 |
|
---|
1792 | // find the first element that is greater than or equal to
|
---|
1793 | // the partition element starting from the left Index.
|
---|
1794 | while ((instance(lo).value(attIndex) <
|
---|
1795 | midMinus) && (lo < hi0)) {
|
---|
1796 | ++lo;
|
---|
1797 | }
|
---|
1798 |
|
---|
1799 | // find an element that is smaller than or equal to
|
---|
1800 | // the partition element starting from the right Index.
|
---|
1801 | while ((instance(hi).value(attIndex) >
|
---|
1802 | midPlus) && (hi > lo0)) {
|
---|
1803 | --hi;
|
---|
1804 | }
|
---|
1805 |
|
---|
1806 | // if the indexes have not crossed, swap
|
---|
1807 | if(lo <= hi) {
|
---|
1808 | swap(lo,hi);
|
---|
1809 | ++lo;
|
---|
1810 | --hi;
|
---|
1811 | }
|
---|
1812 | }
|
---|
1813 |
|
---|
1814 | // If the right index has not reached the left side of array
|
---|
1815 | // must now sort the left partition.
|
---|
1816 | if(lo0 < hi) {
|
---|
1817 | quickSort(attIndex,lo0,hi);
|
---|
1818 | }
|
---|
1819 |
|
---|
1820 | // If the left index has not reached the right side of array
|
---|
1821 | // must now sort the right partition.
|
---|
1822 | if(lo < hi0) {
|
---|
1823 | quickSort(attIndex,lo,hi0);
|
---|
1824 | }
|
---|
1825 | }
|
---|
1826 | }
|
---|
1827 |
|
---|
1828 | /**
|
---|
1829 | * Reads and skips all tokens before next end of line token.
|
---|
1830 | *
|
---|
1831 | * @param tokenizer the stream tokenizer
|
---|
1832 | */
|
---|
1833 | private void readTillEOL(StreamTokenizer tokenizer)
|
---|
1834 | throws IOException{
|
---|
1835 |
|
---|
1836 | while (tokenizer.nextToken() != StreamTokenizer.TT_EOL) {};
|
---|
1837 | tokenizer.pushBack();
|
---|
1838 | }
|
---|
1839 |
|
---|
1840 | /**
|
---|
1841 | * Help function needed for stratification of set.
|
---|
1842 | *
|
---|
1843 | * @param numFolds the number of folds for the stratification
|
---|
1844 | */
|
---|
1845 | private void stratStep (int numFolds){
|
---|
1846 |
|
---|
1847 | FastVector newVec = new FastVector(m_Instances.capacity());
|
---|
1848 | int start = 0, j;
|
---|
1849 |
|
---|
1850 | // create stratified batch
|
---|
1851 | while (newVec.size() < numInstances()) {
|
---|
1852 | j = start;
|
---|
1853 | while (j < numInstances()) {
|
---|
1854 | newVec.addElement(instance(j));
|
---|
1855 | j = j + numFolds;
|
---|
1856 | }
|
---|
1857 | start++;
|
---|
1858 | }
|
---|
1859 | m_Instances = newVec;
|
---|
1860 | }
|
---|
1861 |
|
---|
1862 | /**
|
---|
1863 | * Swaps two instances in the set.
|
---|
1864 | *
|
---|
1865 | * @param i the first instance's index
|
---|
1866 | * @param j the second instance's index
|
---|
1867 | */
|
---|
1868 | private void swap(int i, int j){
|
---|
1869 |
|
---|
1870 | m_Instances.swap(i, j);
|
---|
1871 | }
|
---|
1872 |
|
---|
1873 | /**
|
---|
1874 | * Merges two sets of Instances together. The resulting set will have
|
---|
1875 | * all the attributes of the first set plus all the attributes of the
|
---|
1876 | * second set. The number of instances in both sets must be the same.
|
---|
1877 | *
|
---|
1878 | * @param first the first set of Instances
|
---|
1879 | * @param second the second set of Instances
|
---|
1880 | * @return the merged set of Instances
|
---|
1881 | * @exception IllegalArgumentException if the datasets are not the same size
|
---|
1882 | */
|
---|
1883 | public static Instances mergeInstances(Instances first, Instances second) {
|
---|
1884 |
|
---|
1885 | if (first.numInstances() != second.numInstances()) {
|
---|
1886 | throw new IllegalArgumentException("Instance sets must be of the same size");
|
---|
1887 | }
|
---|
1888 |
|
---|
1889 | // Create the vector of merged attributes
|
---|
1890 | FastVector newAttributes = new FastVector();
|
---|
1891 | for (int i = 0; i < first.numAttributes(); i++) {
|
---|
1892 | newAttributes.addElement(first.attribute(i));
|
---|
1893 | }
|
---|
1894 | for (int i = 0; i < second.numAttributes(); i++) {
|
---|
1895 | newAttributes.addElement(second.attribute(i));
|
---|
1896 | }
|
---|
1897 |
|
---|
1898 | // Create the set of Instances
|
---|
1899 | Instances merged = new Instances(first.relationName() + '_'
|
---|
1900 | + second.relationName(),
|
---|
1901 | newAttributes,
|
---|
1902 | first.numInstances());
|
---|
1903 | // Merge each instance
|
---|
1904 | for (int i = 0; i < first.numInstances(); i++) {
|
---|
1905 | merged.add(first.instance(i).mergeInstance(second.instance(i)));
|
---|
1906 | }
|
---|
1907 | return merged;
|
---|
1908 | }
|
---|
1909 |
|
---|
1910 | /**
|
---|
1911 | * Method for testing this class.
|
---|
1912 | *
|
---|
1913 | * @param argv should contain one element: the name of an ARFF file
|
---|
1914 | */
|
---|
1915 | public static void test(String [] argv) {
|
---|
1916 |
|
---|
1917 | Instances instances, secondInstances, train, test, transformed, empty;
|
---|
1918 | Instance instance;
|
---|
1919 | Random random = new Random(2);
|
---|
1920 | Reader reader;
|
---|
1921 | int start, num;
|
---|
1922 | double newWeight;
|
---|
1923 | FastVector testAtts, testVals;
|
---|
1924 | int i,j;
|
---|
1925 |
|
---|
1926 | try{
|
---|
1927 | if (argv.length > 1) {
|
---|
1928 | throw (new Exception("Usage: Instances [<filename>]"));
|
---|
1929 | }
|
---|
1930 |
|
---|
1931 | // Creating set of instances from scratch
|
---|
1932 | testVals = new FastVector(2);
|
---|
1933 | testVals.addElement("first_value");
|
---|
1934 | testVals.addElement("second_value");
|
---|
1935 | testAtts = new FastVector(2);
|
---|
1936 | testAtts.addElement(new Attribute("nominal_attribute", testVals));
|
---|
1937 | testAtts.addElement(new Attribute("numeric_attribute"));
|
---|
1938 | instances = new Instances("test_set", testAtts, 10);
|
---|
1939 | instances.add(new Instance(instances.numAttributes()));
|
---|
1940 | instances.add(new Instance(instances.numAttributes()));
|
---|
1941 | instances.add(new Instance(instances.numAttributes()));
|
---|
1942 | instances.setClassIndex(0);
|
---|
1943 | System.out.println("\nSet of instances created from scratch:\n");
|
---|
1944 | System.out.println(instances);
|
---|
1945 |
|
---|
1946 | if (argv.length == 1) {
|
---|
1947 | String filename = argv[0];
|
---|
1948 | reader = new FileReader(filename);
|
---|
1949 |
|
---|
1950 | // Read first five instances and print them
|
---|
1951 | System.out.println("\nFirst five instances from file:\n");
|
---|
1952 | instances = new Instances(reader, 1);
|
---|
1953 | instances.setClassIndex(instances.numAttributes() - 1);
|
---|
1954 | i = 0;
|
---|
1955 | while ((i < 5) && (instances.readInstance(reader))) {
|
---|
1956 | i++;
|
---|
1957 | }
|
---|
1958 | System.out.println(instances);
|
---|
1959 |
|
---|
1960 | // Read all the instances in the file
|
---|
1961 | reader = new FileReader(filename);
|
---|
1962 | instances = new Instances(reader);
|
---|
1963 |
|
---|
1964 | // Make the last attribute be the class
|
---|
1965 | instances.setClassIndex(instances.numAttributes() - 1);
|
---|
1966 |
|
---|
1967 | // Print header and instances.
|
---|
1968 | System.out.println("\nDataset:\n");
|
---|
1969 | System.out.println(instances);
|
---|
1970 | System.out.println("\nClass index: "+instances.classIndex());
|
---|
1971 | }
|
---|
1972 |
|
---|
1973 | // Test basic methods based on class index.
|
---|
1974 | System.out.println("\nClass name: "+instances.classAttribute().name());
|
---|
1975 | System.out.println("\nClass index: "+instances.classIndex());
|
---|
1976 | System.out.println("\nClass is nominal: " +
|
---|
1977 | instances.classAttribute().isNominal());
|
---|
1978 | System.out.println("\nClass is numeric: " +
|
---|
1979 | instances.classAttribute().isNumeric());
|
---|
1980 | System.out.println("\nClasses:\n");
|
---|
1981 | for (i = 0; i < instances.numClasses(); i++) {
|
---|
1982 | System.out.println(instances.classAttribute().value(i));
|
---|
1983 | }
|
---|
1984 | System.out.println("\nClass values and labels of instances:\n");
|
---|
1985 | for (i = 0; i < instances.numInstances(); i++) {
|
---|
1986 | Instance inst = instances.instance(i);
|
---|
1987 | System.out.print(inst.classValue() + "\t");
|
---|
1988 | System.out.print(inst.toString(inst.classIndex()));
|
---|
1989 | if (instances.instance(i).classIsMissing()) {
|
---|
1990 | System.out.println("\tis missing");
|
---|
1991 | } else {
|
---|
1992 | System.out.println();
|
---|
1993 | }
|
---|
1994 | }
|
---|
1995 |
|
---|
1996 | // Create random weights.
|
---|
1997 | System.out.println("\nCreating random weights for instances.");
|
---|
1998 | for (i = 0; i < instances.numInstances(); i++) {
|
---|
1999 | instances.instance(i).setWeight(random.nextDouble());
|
---|
2000 | }
|
---|
2001 |
|
---|
2002 | // Print all instances and their weights (and the sum of weights).
|
---|
2003 | System.out.println("\nInstances and their weights:\n");
|
---|
2004 | System.out.println(instances.instancesAndWeights());
|
---|
2005 | System.out.print("\nSum of weights: ");
|
---|
2006 | System.out.println(instances.sumOfWeights());
|
---|
2007 |
|
---|
2008 | // Insert an attribute
|
---|
2009 | secondInstances = new Instances(instances);
|
---|
2010 | Attribute testAtt = new Attribute("Inserted");
|
---|
2011 | secondInstances.insertAttributeAt(testAtt, 0);
|
---|
2012 | System.out.println("\nSet with inserted attribute:\n");
|
---|
2013 | System.out.println(secondInstances);
|
---|
2014 | System.out.println("\nClass name: "
|
---|
2015 | + secondInstances.classAttribute().name());
|
---|
2016 |
|
---|
2017 | // Delete the attribute
|
---|
2018 | secondInstances.deleteAttributeAt(0);
|
---|
2019 | System.out.println("\nSet with attribute deleted:\n");
|
---|
2020 | System.out.println(secondInstances);
|
---|
2021 | System.out.println("\nClass name: "
|
---|
2022 | + secondInstances.classAttribute().name());
|
---|
2023 |
|
---|
2024 | // Test if headers are equal
|
---|
2025 | System.out.println("\nHeaders equal: "+
|
---|
2026 | instances.equalHeaders(secondInstances) + "\n");
|
---|
2027 |
|
---|
2028 | // Print data in internal format.
|
---|
2029 | System.out.println("\nData (internal values):\n");
|
---|
2030 | for (i = 0; i < instances.numInstances(); i++) {
|
---|
2031 | for (j = 0; j < instances.numAttributes(); j++) {
|
---|
2032 | if (instances.instance(i).isMissing(j)) {
|
---|
2033 | System.out.print("? ");
|
---|
2034 | } else {
|
---|
2035 | System.out.print(instances.instance(i).value(j) + " ");
|
---|
2036 | }
|
---|
2037 | }
|
---|
2038 | System.out.println();
|
---|
2039 | }
|
---|
2040 |
|
---|
2041 | // Just print header
|
---|
2042 | System.out.println("\nEmpty dataset:\n");
|
---|
2043 | empty = new Instances(instances, 0);
|
---|
2044 | System.out.println(empty);
|
---|
2045 | System.out.println("\nClass name: "+empty.classAttribute().name());
|
---|
2046 |
|
---|
2047 | // Create copy and rename an attribute and a value (if possible)
|
---|
2048 | if (empty.classAttribute().isNominal()) {
|
---|
2049 | Instances copy = new Instances(empty, 0);
|
---|
2050 | copy.renameAttribute(copy.classAttribute(), "new_name");
|
---|
2051 | copy.renameAttributeValue(copy.classAttribute(),
|
---|
2052 | copy.classAttribute().value(0),
|
---|
2053 | "new_val_name");
|
---|
2054 | System.out.println("\nDataset with names changed:\n" + copy);
|
---|
2055 | System.out.println("\nOriginal dataset:\n" + empty);
|
---|
2056 | }
|
---|
2057 |
|
---|
2058 | // Create and prints subset of instances.
|
---|
2059 | start = instances.numInstances() / 4;
|
---|
2060 | num = instances.numInstances() / 2;
|
---|
2061 | System.out.print("\nSubset of dataset: ");
|
---|
2062 | System.out.println(num + " instances from " + (start + 1)
|
---|
2063 | + ". instance");
|
---|
2064 | secondInstances = new Instances(instances, start, num);
|
---|
2065 | System.out.println("\nClass name: "
|
---|
2066 | + secondInstances.classAttribute().name());
|
---|
2067 |
|
---|
2068 | // Print all instances and their weights (and the sum of weights).
|
---|
2069 | System.out.println("\nInstances and their weights:\n");
|
---|
2070 | System.out.println(secondInstances.instancesAndWeights());
|
---|
2071 | System.out.print("\nSum of weights: ");
|
---|
2072 | System.out.println(secondInstances.sumOfWeights());
|
---|
2073 |
|
---|
2074 | // Create and print training and test sets for 3-fold
|
---|
2075 | // cross-validation.
|
---|
2076 | System.out.println("\nTrain and test folds for 3-fold CV:");
|
---|
2077 | if (instances.classAttribute().isNominal()) {
|
---|
2078 | instances.stratify(3);
|
---|
2079 | }
|
---|
2080 | for (j = 0; j < 3; j++) {
|
---|
2081 | train = instances.trainCV(3,j);
|
---|
2082 | test = instances.testCV(3,j);
|
---|
2083 |
|
---|
2084 | // Print all instances and their weights (and the sum of weights).
|
---|
2085 | System.out.println("\nTrain: ");
|
---|
2086 | System.out.println("\nInstances and their weights:\n");
|
---|
2087 | System.out.println(train.instancesAndWeights());
|
---|
2088 | System.out.print("\nSum of weights: ");
|
---|
2089 | System.out.println(train.sumOfWeights());
|
---|
2090 | System.out.println("\nClass name: "+train.classAttribute().name());
|
---|
2091 | System.out.println("\nTest: ");
|
---|
2092 | System.out.println("\nInstances and their weights:\n");
|
---|
2093 | System.out.println(test.instancesAndWeights());
|
---|
2094 | System.out.print("\nSum of weights: ");
|
---|
2095 | System.out.println(test.sumOfWeights());
|
---|
2096 | System.out.println("\nClass name: "+test.classAttribute().name());
|
---|
2097 | }
|
---|
2098 |
|
---|
2099 | // Randomize instances and print them.
|
---|
2100 | System.out.println("\nRandomized dataset:");
|
---|
2101 | instances.randomize(random);
|
---|
2102 |
|
---|
2103 | // Print all instances and their weights (and the sum of weights).
|
---|
2104 | System.out.println("\nInstances and their weights:\n");
|
---|
2105 | System.out.println(instances.instancesAndWeights());
|
---|
2106 | System.out.print("\nSum of weights: ");
|
---|
2107 | System.out.println(instances.sumOfWeights());
|
---|
2108 |
|
---|
2109 | // Sort instances according to first attribute and
|
---|
2110 | // print them.
|
---|
2111 | System.out.print("\nInstances sorted according to first attribute:\n ");
|
---|
2112 | instances.sort(0);
|
---|
2113 |
|
---|
2114 | // Print all instances and their weights (and the sum of weights).
|
---|
2115 | System.out.println("\nInstances and their weights:\n");
|
---|
2116 | System.out.println(instances.instancesAndWeights());
|
---|
2117 | System.out.print("\nSum of weights: ");
|
---|
2118 | System.out.println(instances.sumOfWeights());
|
---|
2119 | } catch (Exception e) {
|
---|
2120 | e.printStackTrace();
|
---|
2121 | }
|
---|
2122 | }
|
---|
2123 |
|
---|
2124 | /**
|
---|
2125 | * Main method for this class -- just prints a summary of a set
|
---|
2126 | * of instances.
|
---|
2127 | *
|
---|
2128 | * @param argv should contain one element: the name of an ARFF file
|
---|
2129 | */
|
---|
2130 | public static void main(String [] args) {
|
---|
2131 |
|
---|
2132 | try {
|
---|
2133 | Reader r = null;
|
---|
2134 | if (args.length > 1) {
|
---|
2135 | throw (new Exception("Usage: Instances <filename>"));
|
---|
2136 | } else if (args.length == 0) {
|
---|
2137 | r = new BufferedReader(new InputStreamReader(System.in));
|
---|
2138 | } else {
|
---|
2139 | r = new BufferedReader(new FileReader(args[0]));
|
---|
2140 | }
|
---|
2141 | Instances i = new Instances(r);
|
---|
2142 | System.out.println(i.toSummaryString());
|
---|
2143 | } catch (Exception ex) {
|
---|
2144 | System.err.println(ex.getMessage());
|
---|
2145 | }
|
---|
2146 | }
|
---|
2147 | }
|
---|
2148 |
|
---|
2149 |
|
---|
2150 |
|
---|