source: trunk/gsdl/packages/kea/kea-3.0/weka/core/ContingencyTables.java@ 8815

Last change on this file since 8815 was 8815, checked in by mdewsnip, 19 years ago

Kea 3.0, as downloaded from http://www.nzdl.org/kea but with CSTR_abstracts_test, CSTR_abstracts_train, Chinese_test, and Chinese_train directories removed.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.6 KB
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * ContingencyTables.java
 * Copyright (C) 1999 Eibe Frank
 *
 */

package weka.core;

/**
 * Class implementing some statistical routines for contingency tables.
 *
 * @author Eibe Frank ([email protected])
 * @version $Revision: 8815 $
 */
public class ContingencyTables {

  /** The natural logarithm of 2 */
  private static double log2 = Math.log(2);

  /**
   * Returns the chi-squared probability for a given matrix.
   *
   * @param matrix the contingency table
   * @param yates is Yates' correction to be used?
   * @return the chi-squared probability
   */
  public static double chiSquared(double[][] matrix, boolean yates) {

    int df = (matrix.length - 1) * (matrix[0].length - 1);

    return Statistics.chiSquaredProbability(chiVal(matrix, yates), df);
  }
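
  // The value returned by chiSquared() is the upper-tail probability
  // P(X >= chiVal(matrix, yates)) of a chi-squared distribution with
  // df = (#rows - 1) * (#columns - 1) degrees of freedom, so small values
  // indicate that rows and columns are unlikely to be independent.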

  /**
   * Computes the chi-squared statistic for a contingency table.
   *
   * @param matrix the contingency table
   * @param useYates is Yates' correction to be used?
   * @return the value of the chi-squared statistic
   */
  public static double chiVal(double[][] matrix, boolean useYates) {

    int df, nrows, ncols, row, col;
    double[] rtotal, ctotal;
    double expect = 0, chival = 0, n = 0;
    boolean yates = true;

    nrows = matrix.length;
    ncols = matrix[0].length;
    rtotal = new double[nrows];
    ctotal = new double[ncols];
    for (row = 0; row < nrows; row++) {
      for (col = 0; col < ncols; col++) {
        rtotal[row] += matrix[row][col];
        ctotal[col] += matrix[row][col];
        n += matrix[row][col];
      }
    }
    df = (nrows - 1) * (ncols - 1);
    if ((df > 1) || (!useYates)) {
      yates = false;
    } else if (df <= 0) {
      return 0;
    }
    chival = 0.0;
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
        for (col = 0; col < ncols; col++) {
          if (Utils.gr(ctotal[col], 0)) {
            expect = (ctotal[col] * rtotal[row]) / n;
            chival += chiCell(matrix[row][col], expect, yates);
          }
        }
      }
    }
    return chival;
  }
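
  // The statistic accumulated above is X^2 = sum over cells of
  // (observed - expected)^2 / expected, with expected counts
  // E[row][col] = (row total * column total) / n. Yates' continuity
  // correction is only ever applied when the table has a single degree
  // of freedom (i.e. a 2x2 table).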

  /**
   * Tests if Cochran's criterion is fulfilled for the given
   * contingency table. Rows and columns with all zeros are not considered
   * relevant.
   *
   * @param matrix the contingency table to be tested
   * @return true if the contingency table is ok, false if not
   */
  public static boolean cochransCriterion(double[][] matrix) {

    double[] rtotal, ctotal;
    double n = 0, expect, smallfreq = 5;
    int smallcount = 0, nonZeroRows = 0, nonZeroColumns = 0, nrows, ncols,
      row, col;

    nrows = matrix.length;
    ncols = matrix[0].length;

    rtotal = new double[nrows];
    ctotal = new double[ncols];
    for (row = 0; row < nrows; row++) {
      for (col = 0; col < ncols; col++) {
        rtotal[row] += matrix[row][col];
        ctotal[col] += matrix[row][col];
        n += matrix[row][col];
      }
    }
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
        nonZeroRows++;
      }
    }
    for (col = 0; col < ncols; col++) {
      if (Utils.gr(ctotal[col], 0)) {
        nonZeroColumns++;
      }
    }
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
        for (col = 0; col < ncols; col++) {
          if (Utils.gr(ctotal[col], 0)) {
            expect = (ctotal[col] * rtotal[row]) / n;
            if (Utils.sm(expect, smallfreq)) {
              if (Utils.sm(expect, 1)) {
                return false;
              } else {
                smallcount++;
                if (smallcount > (nonZeroRows * nonZeroColumns) / smallfreq) {
                  return false;
                }
              }
            }
          }
        }
      }
    }
    return true;
  }
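
  // Cochran's criterion for trusting the chi-squared approximation: every
  // expected cell count must be at least 1, and no more than 20% of the
  // (non-empty) cells may have an expected count below 5.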

  /**
   * Computes Cramer's V for a contingency table.
   *
   * @param matrix the contingency table
   * @return Cramer's V
   */
  public static double CramersV(double[][] matrix) {

    int row, col, nrows, ncols, min;
    double n = 0;

    nrows = matrix.length;
    ncols = matrix[0].length;
    for (row = 0; row < nrows; row++) {
      for (col = 0; col < ncols; col++) {
        n += matrix[row][col];
      }
    }
    min = nrows < ncols ? nrows - 1 : ncols - 1;
    if ((min == 0) || Utils.eq(n, 0))
      return 0;
    return Math.sqrt(chiVal(matrix, false) / (n * (double) min));
  }
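
  // Cramer's V = sqrt(X^2 / (n * min(#rows - 1, #columns - 1))), a measure
  // of association that lies between 0 (independence) and 1 (complete
  // association).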

  /**
   * Computes the entropy of the given array.
   *
   * @param array the array
   * @return the entropy
   */
  public static double entropy(double[] array) {

    double returnValue = 0, sum = 0;

    for (int i = 0; i < array.length; i++) {
      returnValue -= lnFunc(array[i]);
      sum += array[i];
    }
    if (Utils.eq(sum, 0)) {
      return 0;
    } else {
      return (returnValue + lnFunc(sum)) / (sum * log2);
    }
  }
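
  // entropy() returns H = -sum_i p_i * log2(p_i) with p_i = array[i] / sum;
  // the (returnValue + lnFunc(sum)) / (sum * log2) form is algebraically
  // equivalent and avoids normalising each count separately.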

  /**
   * Computes conditional entropy of the rows given
   * the columns.
   *
   * @param matrix the contingency table
   * @return the conditional entropy of the rows given the columns
   */
  public static double entropyConditionedOnColumns(double[][] matrix) {

    double returnValue = 0, sumForColumn, total = 0;

    for (int j = 0; j < matrix[0].length; j++) {
      sumForColumn = 0;
      for (int i = 0; i < matrix.length; i++) {
        returnValue = returnValue + lnFunc(matrix[i][j]);
        sumForColumn += matrix[i][j];
      }
      returnValue = returnValue - lnFunc(sumForColumn);
      total += sumForColumn;
    }
    if (Utils.eq(total, 0)) {
      return 0;
    }
    return -returnValue / (total * log2);
  }
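
  // This is H(rows | columns) = sum_j P(column j) * H(rows | column j),
  // i.e. the weighted average of the row entropies within each column.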

  /**
   * Computes conditional entropy of the columns given
   * the rows.
   *
   * @param matrix the contingency table
   * @return the conditional entropy of the columns given the rows
   */
  public static double entropyConditionedOnRows(double[][] matrix) {

    double returnValue = 0, sumForRow, total = 0;

    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[0].length; j++) {
        returnValue = returnValue + lnFunc(matrix[i][j]);
        sumForRow += matrix[i][j];
      }
      returnValue = returnValue - lnFunc(sumForRow);
      total += sumForRow;
    }
    if (Utils.eq(total, 0)) {
      return 0;
    }
    return -returnValue / (total * log2);
  }
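
  // Analogously, this is H(columns | rows): the weighted average of the
  // column entropies within each row.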

  /**
   * Computes conditional entropy of the columns given the rows
   * of the test matrix with respect to the train matrix. Uses a
   * Laplace prior. Does NOT normalize the entropy.
   *
   * @param train the train matrix
   * @param test the test matrix
   * @param numClasses the number of symbols for the Laplace prior
   * @return the entropy
   */
  public static double entropyConditionedOnRows(double[][] train,
                                                double[][] test,
                                                double numClasses) {

    double returnValue = 0, trainSumForRow, testSumForRow, testSum = 0;

    for (int i = 0; i < test.length; i++) {
      trainSumForRow = 0;
      testSumForRow = 0;
      for (int j = 0; j < test[0].length; j++) {
        returnValue -= test[i][j] * Math.log(train[i][j] + 1);
        trainSumForRow += train[i][j];
        testSumForRow += test[i][j];
      }
      testSum = testSumForRow;
      returnValue += testSumForRow * Math.log(trainSumForRow + numClasses);
    }
    return returnValue / (testSum * log2);
  }
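
  // Before the final division, returnValue accumulates
  //   -test[i][j] * ln((train[i][j] + 1) / (train row total + numClasses))
  // over all cells, i.e. the test counts are scored against Laplace-smoothed
  // row-conditional probabilities estimated from the train matrix; the result
  // is then converted to bits.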

  /**
   * Computes the rows' entropy for the given contingency table.
   *
   * @param matrix the contingency table
   * @return the rows' entropy
   */
  public static double entropyOverRows(double[][] matrix) {

    double returnValue = 0, sumForRow, total = 0;

    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[0].length; j++) {
        sumForRow += matrix[i][j];
      }
      returnValue = returnValue - lnFunc(sumForRow);
      total += sumForRow;
    }
    if (Utils.eq(total, 0)) {
      return 0;
    }
    return (returnValue + lnFunc(total)) / (total * log2);
  }

  /**
   * Computes the columns' entropy for the given contingency table.
   *
   * @param matrix the contingency table
   * @return the columns' entropy
   */
  public static double entropyOverColumns(double[][] matrix) {

    double returnValue = 0, sumForColumn, total = 0;

    for (int j = 0; j < matrix[0].length; j++) {
      sumForColumn = 0;
      for (int i = 0; i < matrix.length; i++) {
        sumForColumn += matrix[i][j];
      }
      returnValue = returnValue - lnFunc(sumForColumn);
      total += sumForColumn;
    }
    if (Utils.eq(total, 0)) {
      return 0;
    }
    return (returnValue + lnFunc(total)) / (total * log2);
  }
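
  // entropyOverRows() and entropyOverColumns() are simply entropy() applied
  // to the row and column marginal totals of the table, respectively.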

  /**
   * Computes gain ratio for a contingency table (split on rows).
   * Returns 0 if the split entropy is 0.
   *
   * @param matrix the contingency table
   * @return the gain ratio
   */
  public static double gainRatio(double[][] matrix) {

    double preSplit = 0, postSplit = 0, splitEnt = 0,
      sumForRow, sumForColumn, total = 0, infoGain;

    // Compute entropy before split
    for (int i = 0; i < matrix[0].length; i++) {
      sumForColumn = 0;
      for (int j = 0; j < matrix.length; j++)
        sumForColumn += matrix[j][i];
      preSplit += lnFunc(sumForColumn);
      total += sumForColumn;
    }
    preSplit -= lnFunc(total);

    // Compute entropy after split and split entropy
    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[0].length; j++) {
        postSplit += lnFunc(matrix[i][j]);
        sumForRow += matrix[i][j];
      }
      splitEnt += lnFunc(sumForRow);
    }
    postSplit -= splitEnt;
    splitEnt -= lnFunc(total);

    infoGain = preSplit - postSplit;
    if (Utils.eq(splitEnt, 0))
      return 0;
    return infoGain / splitEnt;
  }
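
  // Gain ratio = (H(columns) - H(columns | rows)) / H(rows): the information
  // gained about the column variable by splitting on the rows, normalised by
  // the entropy of the split itself.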

  /**
   * Returns negative base 2 logarithm of multiple hypergeometric
   * probability for a contingency table.
   *
   * @param matrix the contingency table
   * @return the log of the hypergeometric probability of the contingency table
   */
  public static double log2MultipleHypergeometric(double[][] matrix) {

    double sum = 0, sumForRow, sumForColumn, total = 0;

    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[i].length; j++) {
        sumForRow += matrix[i][j];
      }
      sum += SpecialFunctions.lnFactorial(sumForRow);
      total += sumForRow;
    }
    for (int j = 0; j < matrix[0].length; j++) {
      sumForColumn = 0;
      for (int i = 0; i < matrix.length; i++) {
        sumForColumn += matrix[i][j];
      }
      sum += SpecialFunctions.lnFactorial(sumForColumn);
    }
    for (int i = 0; i < matrix.length; i++) {
      for (int j = 0; j < matrix[i].length; j++) {
        sum -= SpecialFunctions.lnFactorial(matrix[i][j]);
      }
    }
    sum -= SpecialFunctions.lnFactorial(total);
    return -sum / log2;
  }
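
  // The probability being log-transformed is
  //   P = (prod_i Ri!) * (prod_j Cj!) / (N! * prod_ij a_ij!),
  // where Ri and Cj are the row and column totals, a_ij the cell counts and
  // N the grand total; the method returns -log2(P).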

  /**
   * Reduces a matrix by deleting all zero rows and columns.
   *
   * @param matrix the matrix to be reduced
   * @return the matrix with all zero rows and columns deleted
   */
  public static double[][] reduceMatrix(double[][] matrix) {

    int row, col, currCol, currRow, nrows, ncols,
      nonZeroRows = 0, nonZeroColumns = 0;
    double[] rtotal, ctotal;
    double[][] newMatrix;

    nrows = matrix.length;
    ncols = matrix[0].length;
    rtotal = new double[nrows];
    ctotal = new double[ncols];
    for (row = 0; row < nrows; row++) {
      for (col = 0; col < ncols; col++) {
        rtotal[row] += matrix[row][col];
        ctotal[col] += matrix[row][col];
      }
    }
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
        nonZeroRows++;
      }
    }
    for (col = 0; col < ncols; col++) {
      if (Utils.gr(ctotal[col], 0)) {
        nonZeroColumns++;
      }
    }
    newMatrix = new double[nonZeroRows][nonZeroColumns];
    currRow = 0;
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
        currCol = 0;
        for (col = 0; col < ncols; col++) {
          if (Utils.gr(ctotal[col], 0)) {
            newMatrix[currRow][currCol] = matrix[row][col];
            currCol++;
          }
        }
        currRow++;
      }
    }
    return newMatrix;
  }

  /**
   * Calculates the symmetrical uncertainty for base 2.
   *
   * @param matrix the contingency table
   * @return the calculated symmetrical uncertainty
   */
  public static double symmetricalUncertainty(double[][] matrix) {

    double sumForColumn, sumForRow, total = 0, columnEntropy = 0,
      rowEntropy = 0, entropyConditionedOnRows = 0, infoGain = 0;

    // Compute entropy for columns
    for (int i = 0; i < matrix[0].length; i++) {
      sumForColumn = 0;
      for (int j = 0; j < matrix.length; j++) {
        sumForColumn += matrix[j][i];
      }
      columnEntropy += lnFunc(sumForColumn);
      total += sumForColumn;
    }
    columnEntropy -= lnFunc(total);

    // Compute entropy for rows and conditional entropy
    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[0].length; j++) {
        sumForRow += matrix[i][j];
        entropyConditionedOnRows += lnFunc(matrix[i][j]);
      }
      rowEntropy += lnFunc(sumForRow);
    }
    entropyConditionedOnRows -= rowEntropy;
    rowEntropy -= lnFunc(total);
    infoGain = columnEntropy - entropyConditionedOnRows;
    if (Utils.eq(columnEntropy, 0) || Utils.eq(rowEntropy, 0))
      return 0;
    return 2.0 * (infoGain / (columnEntropy + rowEntropy));
  }
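
  // Symmetrical uncertainty
  //   SU = 2 * (H(columns) - H(columns | rows)) / (H(columns) + H(rows))
  // lies between 0 and 1; the common scaling factors in the unnormalised
  // entropies computed above cancel in the ratio.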

  /**
   * Computes Goodman and Kruskal's tau-value for a contingency table.
   *
   * @param matrix the contingency table
   * @return Goodman and Kruskal's tau-value
   */
  public static double tauVal(double[][] matrix) {

    int nrows, ncols, row, col;
    double[] ctotal;
    double maxcol = 0, max, maxtotal = 0, n = 0;

    nrows = matrix.length;
    ncols = matrix[0].length;
    ctotal = new double[ncols];
    for (row = 0; row < nrows; row++) {
      max = 0;
      for (col = 0; col < ncols; col++) {
        if (Utils.gr(matrix[row][col], max))
          max = matrix[row][col];
        ctotal[col] += matrix[row][col];
        n += matrix[row][col];
      }
      maxtotal += max;
    }
    if (Utils.eq(n, 0)) {
      return 0;
    }
    maxcol = ctotal[Utils.maxIndex(ctotal)];
    return (maxtotal - maxcol) / (n - maxcol);
  }
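
  // The value returned is (sum of row maxima - largest column total) /
  // (n - largest column total): the proportional reduction in prediction
  // error for the column category when the row category is known.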

  /**
   * Help method for computing entropy. Returns num * Math.log(num),
   * treating values below 1e-6 as zero (so that 0 * log 0 = 0).
   */
  private static double lnFunc(double num) {

    // Constant hard coded for efficiency reasons
    if (num < 1e-6) {
      return 0;
    } else {
      return num * Math.log(num);
    }
  }

  /**
   * Computes the chi-value for one cell in a matrix.
   * From Gary Perlman's unixstat.
   */
  private static double chiCell(double freq, double expect, boolean yates) {

    double diff = freq - expect;

    if (yates) {
      diff = Math.abs(diff) - 0.5;
      if (diff < 0.0) { // over-correction
        diff = 0.0;
      }
    }
    if (Math.abs(expect) < 10e-10) {
      return (0.0);
    } else {
      return (diff * diff / expect);
    }
  }
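
  // A cell contributes (freq - expect)^2 / expect; with Yates' correction
  // the numerator becomes (|freq - expect| - 0.5)^2, clamped at 0 if the
  // correction overshoots the difference.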

  /**
   * Main method for testing this class.
   */
  public static void main(String[] ops) {

    double[] firstRow = {10, 5, 20};
    double[] secondRow = {2, 10, 6};
    double[] thirdRow = {5, 10, 10};
    double[][] matrix = new double[3][0];

    matrix[0] = firstRow; matrix[1] = secondRow; matrix[2] = thirdRow;
    for (int i = 0; i < matrix.length; i++) {
      for (int j = 0; j < matrix[i].length; j++) {
        System.out.print(matrix[i][j] + " ");
      }
      System.out.println();
    }
    System.out.println("Chi-squared probability: " +
                       ContingencyTables.chiSquared(matrix, false));
    System.out.println("Chi-squared value: " +
                       ContingencyTables.chiVal(matrix, false));
    System.out.println("Cochran's criterion fulfilled: " +
                       ContingencyTables.cochransCriterion(matrix));
    System.out.println("Cramer's V: " +
                       ContingencyTables.CramersV(matrix));
    System.out.println("Entropy of first row: " +
                       ContingencyTables.entropy(firstRow));
    System.out.println("Entropy conditioned on columns: " +
                       ContingencyTables.entropyConditionedOnColumns(matrix));
    System.out.println("Entropy conditioned on rows: " +
                       ContingencyTables.entropyConditionedOnRows(matrix));
    System.out.println("Entropy conditioned on rows (with Laplace): " +
                       ContingencyTables.entropyConditionedOnRows(matrix, matrix, 3));
    System.out.println("Entropy of rows: " +
                       ContingencyTables.entropyOverRows(matrix));
    System.out.println("Entropy of columns: " +
                       ContingencyTables.entropyOverColumns(matrix));
    System.out.println("Gain ratio: " +
                       ContingencyTables.gainRatio(matrix));
    System.out.println("Negative log2 of multiple hypergeometric probability: " +
                       ContingencyTables.log2MultipleHypergeometric(matrix));
    System.out.println("Symmetrical uncertainty: " +
                       ContingencyTables.symmetricalUncertainty(matrix));
    System.out.println("Tau value: " +
                       ContingencyTables.tauVal(matrix));
    double[][] newMatrix = new double[3][3];
    newMatrix[0][0] = 1; newMatrix[0][1] = 0; newMatrix[0][2] = 1;
    newMatrix[1][0] = 0; newMatrix[1][1] = 0; newMatrix[1][2] = 0;
    newMatrix[2][0] = 1; newMatrix[2][1] = 0; newMatrix[2][2] = 1;
    System.out.println("Matrix with empty row and column: ");
    for (int i = 0; i < newMatrix.length; i++) {
      for (int j = 0; j < newMatrix[i].length; j++) {
        System.out.print(newMatrix[i][j] + " ");
      }
      System.out.println();
    }
    System.out.println("Reduced matrix: ");
    newMatrix = ContingencyTables.reduceMatrix(newMatrix);
    for (int i = 0; i < newMatrix.length; i++) {
      for (int j = 0; j < newMatrix[i].length; j++) {
        System.out.print(newMatrix[i][j] + " ");
      }
      System.out.println();
    }
  }
}