source: trunk/greenstone3-extensions/vishnu/src/vishnu/cluster/Hierarchical.java@ 8189

Last change on this file since 8189 was 8189, checked in by kjdon, 20 years ago

first version of Imperial College's Visualiser code

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.4 KB
Line 
1package vishnu.cluster;
2
3public class Hierarchical
4{
5 public double[] minrow;
6 public int[] colind;
7 //public SimMatrix smatrix;
8 public TriangleIndex triangle;
9
10 public Hierarchical(int matrix_rows)
11 {
12 // create Triangle with "limit" arg = maximum #clusters
13 // an object to help with indexing the cells
14 // in a triangular matrix
15 triangle = new TriangleIndex(matrix_rows);
16 }
17
18 // set each minrow[i] to the value of the minimum element of that row
19 void set_minrow (SimMatrix m, int row)
20 {
21 // triangular matrix without diagonal, hence zeroth row is all zero
22 if(row == 0) return;
23
24 int i;
25 int index_start, index_end;
26
27 // first column
28 index_start = triangle.index (row, 0);
29 // last column (row(!)-1 because as many rows as columns)
30 index_end = triangle.index (row, row - 1);
31
32 minrow[row] = m.matrix[index_start];
33 colind[row] = index_start;
34
35 for (i = index_start + 1; i <= index_end; i++)
36 {
37 if (m.matrix[i] < minrow[row])
38 {
39 minrow[row] = m.matrix[i];
40 colind[row] = i;
41 }
42 }
43 }
44
45 // return the ROW in which the minimum disimilarity resides
46 int get_minrow (int last_row)
47 {
48 int i;
49 int row;
50 double value;
51
52 row = 1;
53 value = minrow[1];
54
55 for (i = 2; i <= last_row; i++)
56 {
57 if (minrow[i] < value)
58 {
59 row = i;
60 value = minrow[i];
61 }
62 }
63
64 return row;
65 }
66
67 // num_of_rows is "limit" = max #clusters
68 Cluster clustering (SimMatrix sim_matrix, int num_of_rows,
69 double[] heights, LinkFunc linkage_func)
70 {
71 Cluster child_1;
72 Cluster child_2;
73 Cluster parent=null;
74 ClusterArray cluster_ary;
75
76
77 int min_row, index, row, col, i;
78 double sim;
79
80 int last_row_index, sim_matrix_size;
81
82 sim_matrix_size = num_of_rows *(num_of_rows - 1) / 2;
83
84 // Place each of the documents in a cluster of each own
85 // there are vector.length docs but only num_of_rows = limit places in the array
86 // may be two-stage clustering process, i.e. cluster subset of docs and then add the rest
87 cluster_ary = new ClusterArray (num_of_rows);
88 last_row_index = num_of_rows - 1;
89
90 // a variable that is a double and exists for each row - what can this be?
91 // the lowest dissimilarity score?
92 minrow = new double[num_of_rows];
93
94 // a variable that is an int and exists for each row - what can that be?
95 // the document index of the document with that lowest score?
96 colind = new int[num_of_rows];
97
98 for (i = 1; i < num_of_rows; i++)
99 {
100 set_minrow(sim_matrix, i);
101 }
102
103 // do all the hierarchical clustering by running over all rows, updating the matrix
104 // as a new cluster is formed each time
105 for (i = 0; i < num_of_rows - 1; i++)
106 {
107 // find the row with the lowest disimilarity
108 min_row = get_minrow(last_row_index);
109 sim = minrow[min_row];
110
111 // the column with the lowest dis... (sim) in that row
112 index = colind[min_row];
113
114 // array of heights, DESCENDING disim, i.e. sim goes last
115 heights[num_of_rows - 2 - i] = sim;
116
117 row = triangle.i_ind[index];
118 col = triangle.j_ind[index];
119
120 merge (row, col, sim_matrix, last_row_index, linkage_func);
121
122 sim_matrix_size -= last_row_index;
123
124 child_1 = (Cluster)cluster_ary.contents.elementAt(row);
125 child_2 = (Cluster)cluster_ary.contents.elementAt(col);
126
127 // parents get id = -1 to indicate that they are not representing single docs
128 // all leaves have an id (1 - num_rows) that points to the doc they contain
129 parent = new Cluster (-1, null, child_1, child_2, sim, child_1._items + child_2._items);
130
131 child_1._parent = parent;
132 child_2._parent = parent;
133
134 cluster_ary.contents.setElementAt(cluster_ary.contents.elementAt(last_row_index),row);
135 cluster_ary.contents.setElementAt(parent,col);
136
137 last_row_index--;
138 }
139
140 return parent;
141
142 }
143
144 // form a new cluster from the two most similar documents
145 void merge (int i, int j, SimMatrix m, int n, LinkFunc linkage_func)
146 {
147 /* i and j are the matrix elements with maximum similarity */
148 /* matrix is the triangle similarity matrix with last row index n */
149 /* linkage_func is a pointer to the linkage function */
150
151 /* i > j */
152
153 int c, k, index1, index2;
154
155 // go over all rows
156 for (k = 0; k <= n; k++)
157 {
158 // if k < j the affected elements are on row j
159 if (k < j)
160 {
161 index1 = triangle.index (k, j);
162 index2 = triangle.index (k, i);
163 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
164
165 //printf ("1 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]);
166 }
167 else if (k > j && k < i)
168 {
169 /* if j < k < i then only one element is updated per row */
170 index1 = triangle.index (k, j);
171 index2 = triangle.index (k, i);
172 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
173 //printf ("2 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, m.matrix[index2]);
174
175 if (index1 == colind[k])
176 {
177 /* the previous minimum of the row has been changed so recaculate minimum of whole row */
178 set_minrow(m, k);
179 }
180 else
181 {
182 /* if the changed element is smaller than the current minimum update minrow and colind */
183 if (m.matrix[index1] < minrow[k])
184 {
185 minrow[k] = m.matrix[index1];
186 colind[k] = index1;
187 }
188 }
189 }
190 else if (k > i && k < n)
191 {
192 index1 = triangle.index (k, j);
193 index2 = triangle.index (k, i);
194 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
195 //printf ("3 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]);
196
197 if (index1 == colind[k])
198 {
199 /* the previous minimum of the row has been changed so recaculate minimum of whole row */
200 set_minrow(m, k);
201 }
202 else
203 {
204 /* if the changed element is smaller than the current minimum update minrow and colind */
205 if (m.matrix[index1] < minrow[k])
206 {
207 minrow[k] = m.matrix[index1];
208 colind[k] = index1;
209 }
210 }
211
212 index1 = triangle.index (k, i);
213 index2 = triangle.index (k, n);
214 m.matrix[index1] = m.matrix[index2];
215 //printf (" copy [%d, %d] to [%d, %d] = %f\n", k, n, k, i, matrix[index2]);
216
217 if (index1 == colind[k])
218 {
219 /* the previous minimum of the row has been changed so recaculate minimum of whole row */
220 set_minrow(m, k);
221 }
222 else
223 {
224 /* if the changed element is smaller than the current minimum update minrow and colind */
225 if (m.matrix[index1] < minrow[k])
226 {
227 minrow[k] = m.matrix[index1];
228 colind[k] = index1;
229 }
230 }
231 }
232 else if (k == n && i != n)
233 {
234 index1 = triangle.index (k, j);
235 index2 = triangle.index (k, i);
236 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
237 //printf ("5 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]);
238
239 if (index1 == colind[k])
240 {
241 /* the previous minimum of the row has been changed so recaculate minimum of whole row */
242 set_minrow(m, k);
243 }
244 else
245 {
246 /* if the changed element is smaller than the current minimum update minrow and colind */
247 if (m.matrix[index1] < minrow[k])
248 {
249 minrow[k] = m.matrix[index1];
250 colind[k] = index1;
251 }
252 }
253
254 for (c = 0; c < i; c++)
255 {
256 index1 = triangle.index (c, i);
257 index2 = triangle.index (c, n);
258 m.matrix[index1] = m.matrix[index2];
259 //printf ("copy [%d, %d] to [%d, %d] = %f\n", c, n, c, i, matrix[index2]);
260
261 }
262
263 }
264 }
265 set_minrow(m, i);
266 set_minrow(m, j);
267 }
268
269}
270
Note: See TracBrowser for help on using the repository browser.