1 | /* Related word hierarchical clustering using a given triangle similarity matrix */
|
---|
2 | #include <stdio.h>
|
---|
3 | #include <stdlib.h>
|
---|
4 | #include <math.h>
|
---|
5 | #include <assert.h>
|
---|
6 |
|
---|
7 | //#include <sys/times.h>
|
---|
8 | //#ifndef CLK_TCK
|
---|
9 | #include <time.h>
|
---|
10 | //#endif
|
---|
11 |
|
---|
12 | #include "clustering.h"
|
---|
13 | #include "create_trisim.h"
|
---|
14 | #include "simfuncs.h"
|
---|
15 | #include "iterative.h"
|
---|
16 | #include "linkfuncs.h"
|
---|
17 | #include "hierarchical.h"
|
---|
18 | #include "procresults.h"
|
---|
19 | #include "mysammon.h"
|
---|
20 |
|
---|
21 |
|
---|
22 | void *myrealloc(void *p, int n) {
|
---|
23 | void *ret = p? realloc(p, n): mymalloc(n);
|
---|
24 |
|
---|
25 | if(!ret)
|
---|
26 | fprintf(stderr, "clustering.c: no memory for realloc(p,%d)\n", n), exit(1);
|
---|
27 |
|
---|
28 | return ret;
|
---|
29 | }
|
---|
30 |
|
---|
31 | void *mycalloc(int n, int m) {
|
---|
32 | void *ret = calloc(n, m);
|
---|
33 |
|
---|
34 | if(!ret)
|
---|
35 | fprintf(stderr, "clustering.c: no memory for calloc(%d,%d)\n", n, m), exit(1);
|
---|
36 |
|
---|
37 | return ret;
|
---|
38 | }
|
---|
39 |
|
---|
40 | void *mymalloc(int n) {
|
---|
41 | void *ret = malloc(n);
|
---|
42 |
|
---|
43 | if(!ret)
|
---|
44 | fprintf(stderr, "clustering.c: no memory for malloc(%d,%d)\n", n), exit(1);
|
---|
45 |
|
---|
46 | return ret;
|
---|
47 | }
|
---|
48 |
|
---|
49 | void myfree(void *p) {
|
---|
50 | if(p)
|
---|
51 | free(p);
|
---|
52 | }
|
---|
53 |
|
---|
54 | void free_vector_array(double **d,int n)
|
---|
55 | {
|
---|
56 | int i;
|
---|
57 | for(i=0;i<n;i++)
|
---|
58 | free(d[i]);
|
---|
59 | free(d);
|
---|
60 | }
|
---|
61 |
|
---|
62 | int VECTOR_SIZE;
|
---|
63 |
|
---|
64 | /*
|
---|
65 | * Main
|
---|
66 | */
|
---|
67 | /* 0 vectors_3333 15 1 10 20 */
|
---|
68 |
|
---|
69 | int
|
---|
70 | do_clustering (int linkage, double **vectors, int num_of_vectors, int arraySize, int min_c_items,
|
---|
71 | int target_clusters, int limit, int *coding, double ***centroidsPtr, char * buffer, int
|
---|
72 | *cursize, int * curptr)
|
---|
73 | {
|
---|
74 | // char * buffer = (char *) malloc (256);
|
---|
75 | // int cursize = 256;
|
---|
76 | // int curptr = 0;
|
---|
77 | char bbuf[20];
|
---|
78 | int n;
|
---|
79 |
|
---|
80 | double *sim_matrix, **centroids;
|
---|
81 | double (*linkage_func)(double, double);
|
---|
82 |
|
---|
83 | //struct tms *timer = (struct tms *) mymalloc(sizeof (struct tms));
|
---|
84 |
|
---|
85 | Cluster *toplevel_cluster;
|
---|
86 | Cluster **splitclusters = NULL;
|
---|
87 | int clusters = 0, zeros=0;
|
---|
88 |
|
---|
89 | double threshold;
|
---|
90 | double (*sim_func)(double *, double *, int) = cosine_distance;
|
---|
91 |
|
---|
92 | double *heights;
|
---|
93 |
|
---|
94 | int i, j;
|
---|
95 |
|
---|
96 | VECTOR_SIZE = arraySize;
|
---|
97 | switch (linkage) {
|
---|
98 | case COMPLETE_LINKAGE:
|
---|
99 | linkage_func = maxlink;
|
---|
100 | break;
|
---|
101 | case AVERAGE_LINKAGE:
|
---|
102 | linkage_func = avelink;
|
---|
103 | break;
|
---|
104 | case SINGLE_LINKAGE:
|
---|
105 | linkage_func = minlink;
|
---|
106 | break;
|
---|
107 | default:
|
---|
108 | printf ("clustering.c: main(): linkage type %d not defined.\n", linkage);
|
---|
109 | exit(0);
|
---|
110 | }
|
---|
111 |
|
---|
112 | if(limit > num_of_vectors) limit = num_of_vectors;
|
---|
113 |
|
---|
114 | sim_matrix = create_sim_matrix (vectors, limit, sim_func);
|
---|
115 |
|
---|
116 |
|
---|
117 | /* an array of heights sorted by descending disim */
|
---|
118 | heights = (double *) mycalloc ((limit - 1), sizeof (double));
|
---|
119 |
|
---|
120 | toplevel_cluster = hierarchical_clustering (sim_matrix, limit, heights, linkage_func);
|
---|
121 |
|
---|
122 |
|
---|
123 | threshold = split_clusters (toplevel_cluster, &splitclusters, heights, limit, target_clusters-zeros, min_c_items, &clusters);
|
---|
124 |
|
---|
125 | iterative_clustering (splitclusters, clusters, vectors, limit, num_of_vectors, sim_func);
|
---|
126 |
|
---|
127 | centroids = malloc(sizeof(double **)*clusters);
|
---|
128 | assert(centroids);
|
---|
129 |
|
---|
130 | n = sprintf(bbuf,"%d\n",clusters);
|
---|
131 | buffer = appendString(buffer,cursize,curptr,bbuf,strlen(bbuf));
|
---|
132 |
|
---|
133 | for (i = 0; i < clusters; i++)
|
---|
134 | {
|
---|
135 | centroids[i] = calloc(VECTOR_SIZE,sizeof(double));
|
---|
136 | assert(centroids[i]);
|
---|
137 | print_cluster_children (splitclusters[i], coding, buffer, cursize,curptr);
|
---|
138 | for (j = 0; j < splitclusters[i]->iter_items_num; j++)
|
---|
139 | {
|
---|
140 | if( j<splitclusters[i]->iter_items_num-1 )
|
---|
141 | {
|
---|
142 | n = sprintf(bbuf,"%d%s",coding[splitclusters[i]->iter_items[j]],",");
|
---|
143 | buffer = appendString(buffer,cursize,curptr,bbuf,strlen(bbuf));
|
---|
144 | }
|
---|
145 | else
|
---|
146 | {
|
---|
147 | n = sprintf(bbuf,"%d%s",coding[splitclusters[i]->iter_items[j]],"");
|
---|
148 | buffer = appendString(buffer,cursize,curptr,bbuf,strlen(bbuf));
|
---|
149 |
|
---|
150 | }
|
---|
151 | }
|
---|
152 |
|
---|
153 | for (j = 0; j < VECTOR_SIZE; j++)
|
---|
154 | {
|
---|
155 | centroids[i][j]=splitclusters[i]->centroid[j];
|
---|
156 | }
|
---|
157 |
|
---|
158 | buffer = appendString(buffer,cursize,curptr,"\n",1);
|
---|
159 | }
|
---|
160 |
|
---|
161 | free (sim_matrix);
|
---|
162 | free_cluster (toplevel_cluster);
|
---|
163 | free (heights);
|
---|
164 |
|
---|
165 | *centroidsPtr = centroids;
|
---|
166 | return clusters;
|
---|
167 | }
|
---|
168 |
|
---|
169 | int clustering (int linkage, MatrixNodePtr *matrix, int num_of_vectors, int arraySize, int min_c_items,
|
---|
170 | int target_clusters, int limit, char *buffer, int *cursize, int *curptr)
|
---|
171 | {
|
---|
172 | int *coding, *newcoding, clusterCount, groupCount, i;
|
---|
173 | double **vectors, **centroids, **newCentroids;
|
---|
174 | vectors = read_vectors (matrix, num_of_vectors, arraySize, &coding);
|
---|
175 |
|
---|
176 | clusterCount = do_clustering (linkage, vectors, num_of_vectors, arraySize, min_c_items,
|
---|
177 | target_clusters, limit, coding, ¢roids, buffer, cursize, curptr);
|
---|
178 |
|
---|
179 | mysammon( centroids, clusterCount, arraySize, buffer, cursize, curptr);
|
---|
180 |
|
---|
181 | free_vector_array(vectors,num_of_vectors);
|
---|
182 | free_vector_array(centroids,clusterCount);
|
---|
183 |
|
---|
184 |
|
---|
185 | free(coding);
|
---|
186 | return 0;
|
---|
187 |
|
---|
188 | }
|
---|
189 |
|
---|
190 |
|
---|
191 |
|
---|
192 |
|
---|
193 |
|
---|
194 |
|
---|
195 |
|
---|