source: trunk/gsdl/packages/kea/kea-3.0/weka/core/ContingencyTables.java@ 8815

Last change on this file since 8815 was 8815, checked in by mdewsnip, 19 years ago

Kea 3.0, as downloaded from http://www.nzdl.org/kea but with CSTR_abstracts_test, CSTR_abstracts_train, Chinese_test, and Chinese_train directories removed.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.6 KB
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * ContingencyTables.java
 * Copyright (C) 1999 Eibe Frank
 *
 */

package weka.core;

/**
 * Class implementing some statistical routines for contingency tables.
 *
 * @author Eibe Frank ([email protected])
 * @version $Revision: 8815 $
 */
public class ContingencyTables {

  /** The natural logarithm of 2 */
  private static double log2 = Math.log(2);

  /**
   * Returns the chi-squared probability for a given matrix.
   *
   * @param matrix the contingency table
   * @param yates is Yates' correction to be used?
   * @return the chi-squared probability
   */
  public static double chiSquared(double[][] matrix, boolean yates) {

    int df = (matrix.length - 1) * (matrix[0].length - 1);

    return Statistics.chiSquaredProbability(chiVal(matrix, yates), df);
  }
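
  // The value returned by chiSquared() is the upper-tail probability
  // P(X >= chiVal(matrix, yates)) of a chi-squared distribution with
  // df = (#rows - 1) * (#columns - 1) degrees of freedom, so small values
  // indicate that rows and columns are unlikely to be independent.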

  /**
   * Computes the chi-squared statistic for a contingency table.
   *
   * @param matrix the contingency table
   * @param useYates is Yates' correction to be used?
   * @return the value of the chi-squared statistic
   */
  public static double chiVal(double[][] matrix, boolean useYates) {

    int df, nrows, ncols, row, col;
    double[] rtotal, ctotal;
    double expect = 0, chival = 0, n = 0;
    boolean yates = true;

    nrows = matrix.length;
    ncols = matrix[0].length;
    rtotal = new double[nrows];
    ctotal = new double[ncols];
    for (row = 0; row < nrows; row++) {
      for (col = 0; col < ncols; col++) {
        rtotal[row] += matrix[row][col];
        ctotal[col] += matrix[row][col];
        n += matrix[row][col];
      }
    }
    df = (nrows - 1) * (ncols - 1);
    if ((df > 1) || (!useYates)) {
      yates = false;
    } else if (df <= 0) {
      return 0;
    }
    chival = 0.0;
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
        for (col = 0; col < ncols; col++) {
          if (Utils.gr(ctotal[col], 0)) {
            expect = (ctotal[col] * rtotal[row]) / n;
            chival += chiCell(matrix[row][col], expect, yates);
          }
        }
      }
    }
    return chival;
  }
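
  // The statistic accumulated above is X^2 = sum over cells of
  // (observed - expected)^2 / expected, with expected counts
  // E[row][col] = (row total * column total) / n. Yates' continuity
  // correction is only ever applied when the table has a single degree
  // of freedom (i.e. a 2x2 table).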

  /**
   * Tests if Cochran's criterion is fulfilled for the given
   * contingency table. Rows and columns with all zeros are not considered
   * relevant.
   *
   * @param matrix the contingency table to be tested
   * @return true if the contingency table is ok, false if not
   */
  public static boolean cochransCriterion(double[][] matrix) {

    double[] rtotal, ctotal;
    double n = 0, expect, smallfreq = 5;
    int smallcount = 0, nonZeroRows = 0, nonZeroColumns = 0, nrows, ncols,
      row, col;

    nrows = matrix.length;
    ncols = matrix[0].length;

    rtotal = new double[nrows];
    ctotal = new double[ncols];
    for (row = 0; row < nrows; row++) {
      for (col = 0; col < ncols; col++) {
        rtotal[row] += matrix[row][col];
        ctotal[col] += matrix[row][col];
        n += matrix[row][col];
      }
    }
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
        nonZeroRows++;
      }
    }
    for (col = 0; col < ncols; col++) {
      if (Utils.gr(ctotal[col], 0)) {
        nonZeroColumns++;
      }
    }
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
        for (col = 0; col < ncols; col++) {
          if (Utils.gr(ctotal[col], 0)) {
            expect = (ctotal[col] * rtotal[row]) / n;
            if (Utils.sm(expect, smallfreq)) {
              if (Utils.sm(expect, 1)) {
                return false;
              } else {
                smallcount++;
                if (smallcount > (nonZeroRows * nonZeroColumns) / smallfreq) {
                  return false;
                }
              }
            }
          }
        }
      }
    }
    return true;
  }
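
  // Cochran's criterion for trusting the chi-squared approximation: every
  // expected cell count must be at least 1, and no more than 20% of the
  // (non-empty) cells may have an expected count below 5.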

  /**
   * Computes Cramer's V for a contingency table.
   *
   * @param matrix the contingency table
   * @return Cramer's V
   */
  public static double CramersV(double[][] matrix) {

    int row, col, nrows, ncols, min;
    double n = 0;

    nrows = matrix.length;
    ncols = matrix[0].length;
    for (row = 0; row < nrows; row++) {
      for (col = 0; col < ncols; col++) {
        n += matrix[row][col];
      }
    }
    min = nrows < ncols ? nrows - 1 : ncols - 1;
    if ((min == 0) || Utils.eq(n, 0))
      return 0;
    return Math.sqrt(chiVal(matrix, false) / (n * (double) min));
  }
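
  // Cramer's V = sqrt(X^2 / (n * min(#rows - 1, #columns - 1))), a measure
  // of association that lies between 0 (independence) and 1 (complete
  // association).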

  /**
   * Computes the entropy of the given array.
   *
   * @param array the array
   * @return the entropy
   */
  public static double entropy(double[] array) {

    double returnValue = 0, sum = 0;

    for (int i = 0; i < array.length; i++) {
      returnValue -= lnFunc(array[i]);
      sum += array[i];
    }
    if (Utils.eq(sum, 0)) {
      return 0;
    } else {
      return (returnValue + lnFunc(sum)) / (sum * log2);
    }
  }
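
  // entropy() returns H = -sum_i p_i * log2(p_i) with p_i = array[i] / sum;
  // the (returnValue + lnFunc(sum)) / (sum * log2) form is algebraically
  // equivalent and avoids normalising each count separately.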

  /**
   * Computes conditional entropy of the rows given
   * the columns.
   *
   * @param matrix the contingency table
   * @return the conditional entropy of the rows given the columns
   */
  public static double entropyConditionedOnColumns(double[][] matrix) {

    double returnValue = 0, sumForColumn, total = 0;

    for (int j = 0; j < matrix[0].length; j++) {
      sumForColumn = 0;
      for (int i = 0; i < matrix.length; i++) {
        returnValue = returnValue + lnFunc(matrix[i][j]);
        sumForColumn += matrix[i][j];
      }
      returnValue = returnValue - lnFunc(sumForColumn);
      total += sumForColumn;
    }
    if (Utils.eq(total, 0)) {
      return 0;
    }
    return -returnValue / (total * log2);
  }
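
  // This is H(rows | columns) = sum_j P(column j) * H(rows | column j),
  // i.e. the weighted average of the row entropies within each column.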

  /**
   * Computes conditional entropy of the columns given
   * the rows.
   *
   * @param matrix the contingency table
   * @return the conditional entropy of the columns given the rows
   */
  public static double entropyConditionedOnRows(double[][] matrix) {

    double returnValue = 0, sumForRow, total = 0;

    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[0].length; j++) {
        returnValue = returnValue + lnFunc(matrix[i][j]);
        sumForRow += matrix[i][j];
      }
      returnValue = returnValue - lnFunc(sumForRow);
      total += sumForRow;
    }
    if (Utils.eq(total, 0)) {
      return 0;
    }
    return -returnValue / (total * log2);
  }
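
  // Analogously, this is H(columns | rows): the weighted average of the
  // column entropies within each row.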

  /**
   * Computes conditional entropy of the columns given the rows
   * of the test matrix with respect to the train matrix. Uses a
   * Laplace prior. Does NOT normalize the entropy.
   *
   * @param train the train matrix
   * @param test the test matrix
   * @param numClasses the number of symbols for the Laplace prior
   * @return the entropy
   */
  public static double entropyConditionedOnRows(double[][] train,
                                                double[][] test,
                                                double numClasses) {

    double returnValue = 0, trainSumForRow, testSumForRow, testSum = 0;

    for (int i = 0; i < test.length; i++) {
      trainSumForRow = 0;
      testSumForRow = 0;
      for (int j = 0; j < test[0].length; j++) {
        returnValue -= test[i][j] * Math.log(train[i][j] + 1);
        trainSumForRow += train[i][j];
        testSumForRow += test[i][j];
      }
      testSum = testSumForRow;
      returnValue += testSumForRow * Math.log(trainSumForRow + numClasses);
    }
    return returnValue / (testSum * log2);
  }
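
  // Before the final division, returnValue accumulates
  //   -test[i][j] * ln((train[i][j] + 1) / (train row total + numClasses))
  // over all cells, i.e. the test counts are scored against Laplace-smoothed
  // row-conditional probabilities estimated from the train matrix; the result
  // is then converted to bits.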

  /**
   * Computes the rows' entropy for the given contingency table.
   *
   * @param matrix the contingency table
   * @return the rows' entropy
   */
  public static double entropyOverRows(double[][] matrix) {

    double returnValue = 0, sumForRow, total = 0;

    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[0].length; j++) {
        sumForRow += matrix[i][j];
      }
      returnValue = returnValue - lnFunc(sumForRow);
      total += sumForRow;
    }
    if (Utils.eq(total, 0)) {
      return 0;
    }
    return (returnValue + lnFunc(total)) / (total * log2);
  }

  /**
   * Computes the columns' entropy for the given contingency table.
   *
   * @param matrix the contingency table
   * @return the columns' entropy
   */
  public static double entropyOverColumns(double[][] matrix) {

    double returnValue = 0, sumForColumn, total = 0;

    for (int j = 0; j < matrix[0].length; j++) {
      sumForColumn = 0;
      for (int i = 0; i < matrix.length; i++) {
        sumForColumn += matrix[i][j];
      }
      returnValue = returnValue - lnFunc(sumForColumn);
      total += sumForColumn;
    }
    if (Utils.eq(total, 0)) {
      return 0;
    }
    return (returnValue + lnFunc(total)) / (total * log2);
  }
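
  // entropyOverRows() and entropyOverColumns() are simply entropy() applied
  // to the row and column marginal totals of the table, respectively.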

  /**
   * Computes gain ratio for a contingency table (split on rows).
   * Returns 0 if the split entropy is 0.
   *
   * @param matrix the contingency table
   * @return the gain ratio
   */
  public static double gainRatio(double[][] matrix) {

    double preSplit = 0, postSplit = 0, splitEnt = 0,
      sumForRow, sumForColumn, total = 0, infoGain;

    // Compute entropy before split
    for (int i = 0; i < matrix[0].length; i++) {
      sumForColumn = 0;
      for (int j = 0; j < matrix.length; j++)
        sumForColumn += matrix[j][i];
      preSplit += lnFunc(sumForColumn);
      total += sumForColumn;
    }
    preSplit -= lnFunc(total);

    // Compute entropy after split and split entropy
    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[0].length; j++) {
        postSplit += lnFunc(matrix[i][j]);
        sumForRow += matrix[i][j];
      }
      splitEnt += lnFunc(sumForRow);
    }
    postSplit -= splitEnt;
    splitEnt -= lnFunc(total);

    infoGain = preSplit - postSplit;
    if (Utils.eq(splitEnt, 0))
      return 0;
    return infoGain / splitEnt;
  }
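
  // Gain ratio = (H(columns) - H(columns | rows)) / H(rows): the information
  // gained about the column variable by splitting on the rows, normalised by
  // the entropy of the split itself.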

  /**
   * Returns negative base 2 logarithm of multiple hypergeometric
   * probability for a contingency table.
   *
   * @param matrix the contingency table
   * @return the log of the hypergeometric probability of the contingency table
   */
  public static double log2MultipleHypergeometric(double[][] matrix) {

    double sum = 0, sumForRow, sumForColumn, total = 0;

    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[i].length; j++) {
        sumForRow += matrix[i][j];
      }
      sum += SpecialFunctions.lnFactorial(sumForRow);
      total += sumForRow;
    }
    for (int j = 0; j < matrix[0].length; j++) {
      sumForColumn = 0;
      for (int i = 0; i < matrix.length; i++) {
        sumForColumn += matrix[i][j];
      }
      sum += SpecialFunctions.lnFactorial(sumForColumn);
    }
    for (int i = 0; i < matrix.length; i++) {
      for (int j = 0; j < matrix[i].length; j++) {
        sum -= SpecialFunctions.lnFactorial(matrix[i][j]);
      }
    }
    sum -= SpecialFunctions.lnFactorial(total);
    return -sum / log2;
  }
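
  // The probability being log-transformed is
  //   P = (prod_i Ri!) * (prod_j Cj!) / (N! * prod_ij a_ij!),
  // where Ri and Cj are the row and column totals, a_ij the cell counts and
  // N the grand total; the method returns -log2(P).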

  /**
   * Reduces a matrix by deleting all zero rows and columns.
   *
   * @param matrix the matrix to be reduced
   * @return the matrix with all zero rows and columns deleted
   */
  public static double[][] reduceMatrix(double[][] matrix) {

    int row, col, currCol, currRow, nrows, ncols,
      nonZeroRows = 0, nonZeroColumns = 0;
    double[] rtotal, ctotal;
    double[][] newMatrix;

    nrows = matrix.length;
    ncols = matrix[0].length;
    rtotal = new double[nrows];
    ctotal = new double[ncols];
    for (row = 0; row < nrows; row++) {
      for (col = 0; col < ncols; col++) {
        rtotal[row] += matrix[row][col];
        ctotal[col] += matrix[row][col];
      }
    }
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
        nonZeroRows++;
      }
    }
    for (col = 0; col < ncols; col++) {
      if (Utils.gr(ctotal[col], 0)) {
        nonZeroColumns++;
      }
    }
    newMatrix = new double[nonZeroRows][nonZeroColumns];
    currRow = 0;
    for (row = 0; row < nrows; row++) {
      if (Utils.gr(rtotal[row], 0)) {
        currCol = 0;
        for (col = 0; col < ncols; col++) {
          if (Utils.gr(ctotal[col], 0)) {
            newMatrix[currRow][currCol] = matrix[row][col];
            currCol++;
          }
        }
        currRow++;
      }
    }
    return newMatrix;
  }

  /**
   * Calculates the symmetrical uncertainty for base 2.
   *
   * @param matrix the contingency table
   * @return the calculated symmetrical uncertainty
   */
  public static double symmetricalUncertainty(double[][] matrix) {

    double sumForColumn, sumForRow, total = 0, columnEntropy = 0,
      rowEntropy = 0, entropyConditionedOnRows = 0, infoGain = 0;

    // Compute entropy for columns
    for (int i = 0; i < matrix[0].length; i++) {
      sumForColumn = 0;
      for (int j = 0; j < matrix.length; j++) {
        sumForColumn += matrix[j][i];
      }
      columnEntropy += lnFunc(sumForColumn);
      total += sumForColumn;
    }
    columnEntropy -= lnFunc(total);

    // Compute entropy for rows and conditional entropy
    for (int i = 0; i < matrix.length; i++) {
      sumForRow = 0;
      for (int j = 0; j < matrix[0].length; j++) {
        sumForRow += matrix[i][j];
        entropyConditionedOnRows += lnFunc(matrix[i][j]);
      }
      rowEntropy += lnFunc(sumForRow);
    }
    entropyConditionedOnRows -= rowEntropy;
    rowEntropy -= lnFunc(total);
    infoGain = columnEntropy - entropyConditionedOnRows;
    if (Utils.eq(columnEntropy, 0) || Utils.eq(rowEntropy, 0))
      return 0;
    return 2.0 * (infoGain / (columnEntropy + rowEntropy));
  }
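
  // Symmetrical uncertainty
  //   SU = 2 * (H(columns) - H(columns | rows)) / (H(columns) + H(rows))
  // lies between 0 and 1; the common scaling factors in the unnormalised
  // entropies computed above cancel in the ratio.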

  /**
   * Computes Goodman and Kruskal's tau-value for a contingency table.
   *
   * @param matrix the contingency table
   * @return Goodman and Kruskal's tau-value
   */
  public static double tauVal(double[][] matrix) {

    int nrows, ncols, row, col;
    double[] ctotal;
    double maxcol = 0, max, maxtotal = 0, n = 0;

    nrows = matrix.length;
    ncols = matrix[0].length;
    ctotal = new double[ncols];
    for (row = 0; row < nrows; row++) {
      max = 0;
      for (col = 0; col < ncols; col++) {
        if (Utils.gr(matrix[row][col], max))
          max = matrix[row][col];
        ctotal[col] += matrix[row][col];
        n += matrix[row][col];
      }
      maxtotal += max;
    }
    if (Utils.eq(n, 0)) {
      return 0;
    }
    maxcol = ctotal[Utils.maxIndex(ctotal)];
    return (maxtotal - maxcol) / (n - maxcol);
  }
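
  // The value returned is (sum of row maxima - largest column total) /
  // (n - largest column total): the proportional reduction in prediction
  // error for the column category when the row category is known.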

  /**
   * Help method for computing entropy. Returns num * Math.log(num),
   * treating values below 1e-6 as zero (so that 0 * log 0 = 0).
   */
  private static double lnFunc(double num) {

    // Constant hard coded for efficiency reasons
    if (num < 1e-6) {
      return 0;
    } else {
      return num * Math.log(num);
    }
  }

  /**
   * Computes the chi-value for one cell in a matrix.
   * From Gary Perlman's unixstat.
   */
  private static double chiCell(double freq, double expect, boolean yates) {

    double diff = freq - expect;

    if (yates) {
      diff = Math.abs(diff) - 0.5;
      if (diff < 0.0) { // over-correction
        diff = 0.0;
      }
    }
    if (Math.abs(expect) < 10e-10) {
      return (0.0);
    } else {
      return (diff * diff / expect);
    }
  }
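
  // A cell contributes (freq - expect)^2 / expect; with Yates' correction
  // the numerator becomes (|freq - expect| - 0.5)^2, clamped at 0 if the
  // correction overshoots the difference.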

  /**
   * Main method for testing this class.
   */
  public static void main(String[] ops) {

    double[] firstRow = {10, 5, 20};
    double[] secondRow = {2, 10, 6};
    double[] thirdRow = {5, 10, 10};
    double[][] matrix = new double[3][0];

    matrix[0] = firstRow; matrix[1] = secondRow; matrix[2] = thirdRow;
    for (int i = 0; i < matrix.length; i++) {
      for (int j = 0; j < matrix[i].length; j++) {
        System.out.print(matrix[i][j] + " ");
      }
      System.out.println();
    }
    System.out.println("Chi-squared probability: " +
                       ContingencyTables.chiSquared(matrix, false));
    System.out.println("Chi-squared value: " +
                       ContingencyTables.chiVal(matrix, false));
    System.out.println("Cochran's criterion fulfilled: " +
                       ContingencyTables.cochransCriterion(matrix));
    System.out.println("Cramer's V: " +
                       ContingencyTables.CramersV(matrix));
    System.out.println("Entropy of first row: " +
                       ContingencyTables.entropy(firstRow));
    System.out.println("Entropy conditioned on columns: " +
                       ContingencyTables.entropyConditionedOnColumns(matrix));
    System.out.println("Entropy conditioned on rows: " +
                       ContingencyTables.entropyConditionedOnRows(matrix));
    System.out.println("Entropy conditioned on rows (with Laplace): " +
                       ContingencyTables.entropyConditionedOnRows(matrix, matrix, 3));
    System.out.println("Entropy of rows: " +
                       ContingencyTables.entropyOverRows(matrix));
    System.out.println("Entropy of columns: " +
                       ContingencyTables.entropyOverColumns(matrix));
    System.out.println("Gain ratio: " +
                       ContingencyTables.gainRatio(matrix));
    System.out.println("Negative log2 of multiple hypergeometric probability: " +
                       ContingencyTables.log2MultipleHypergeometric(matrix));
    System.out.println("Symmetrical uncertainty: " +
                       ContingencyTables.symmetricalUncertainty(matrix));
    System.out.println("Tau value: " +
                       ContingencyTables.tauVal(matrix));
    double[][] newMatrix = new double[3][3];
    newMatrix[0][0] = 1; newMatrix[0][1] = 0; newMatrix[0][2] = 1;
    newMatrix[1][0] = 0; newMatrix[1][1] = 0; newMatrix[1][2] = 0;
    newMatrix[2][0] = 1; newMatrix[2][1] = 0; newMatrix[2][2] = 1;
    System.out.println("Matrix with empty row and column: ");
    for (int i = 0; i < newMatrix.length; i++) {
      for (int j = 0; j < newMatrix[i].length; j++) {
        System.out.print(newMatrix[i][j] + " ");
      }
      System.out.println();
    }
    System.out.println("Reduced matrix: ");
    newMatrix = ContingencyTables.reduceMatrix(newMatrix);
    for (int i = 0; i < newMatrix.length; i++) {
      for (int j = 0; j < newMatrix[i].length; j++) {
        System.out.print(newMatrix[i][j] + " ");
      }
      System.out.println();
    }
  }
}